From 54e36bc94fdc9e696c261be2c71cf23500db0d58 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Fri, 15 Jan 2016 16:57:37 -0500 Subject: [PATCH 01/12] MAINT: create abstract events class. --- zipline/pipeline/data/buyback_auth.py | 1 + zipline/pipeline/loaders/buyback_auth.py | 1 + zipline/pipeline/loaders/events.py | 62 ++++++++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 zipline/pipeline/data/buyback_auth.py create mode 100644 zipline/pipeline/loaders/buyback_auth.py create mode 100644 zipline/pipeline/loaders/events.py diff --git a/zipline/pipeline/data/buyback_auth.py b/zipline/pipeline/data/buyback_auth.py new file mode 100644 index 00000000..8710fd29 --- /dev/null +++ b/zipline/pipeline/data/buyback_auth.py @@ -0,0 +1 @@ +__author__ = 'mtydykov' diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py new file mode 100644 index 00000000..8710fd29 --- /dev/null +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -0,0 +1 @@ +__author__ = 'mtydykov' diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py new file mode 100644 index 00000000..1e6fe7e0 --- /dev/null +++ b/zipline/pipeline/loaders/events.py @@ -0,0 +1,62 @@ +from abc import abstractmethod +from itertools import repeat + +import pandas as pd +from six import iteritems +from toolz import merge + +from .base import PipelineLoader +from .frame import DataFrameLoader +from zipline.utils.memoize import lazyval + + +class EventsLoader(PipelineLoader): + def __init__(self, + all_dates, + announcement_dates, + infer_timestamps=False, + dataset=None): + self.all_dates = all_dates + self.announcement_dates = ( + announcement_dates.copy() + ) + dates = self.all_dates.values + for k, v in iteritems(announcement_dates): + if isinstance(v, pd.DatetimeIndex): + if not infer_timestamps: + raise ValueError( + "Got DatetimeIndex of announcement dates for sid %d.\n" + "Pass `infer_timestamps=True` to use the first date in" + " 
`all_dates` as implicit timestamp." + ) + # If we are passed a DatetimeIndex, we always have + # knowledge of the announcements. + announcement_dates[k] = pd.Series( + v, index=repeat(dates[0], len(v)), + ) + self.dataset = dataset + + + @abstractmethod + def get_loader(self): + raise NotImplementedError("EventsLoader must implement 'get_loader'.") + + + def load_adjusted_array(self, columns, dates, assets, mask): + return merge( + self.get_loader(column).load_adjusted_array( + [column], dates, assets, mask + ) + for column in columns + ) + + @lazyval + def date_frame_loader(self, col_name, next_or_prev): + return DataFrameLoader( + col_name, + next_or_prev( + self.all_dates, + self.announcement_dates, + ), + adjustments=None, + ) From 534f820a9128a19f3a03ef9ea252e4c1a9777d85 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Fri, 15 Jan 2016 16:57:51 -0500 Subject: [PATCH 02/12] ENH: add buyback_auth dataset. --- zipline/pipeline/data/buyback_auth.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/zipline/pipeline/data/buyback_auth.py b/zipline/pipeline/data/buyback_auth.py index 8710fd29..dd58e9b2 100644 --- a/zipline/pipeline/data/buyback_auth.py +++ b/zipline/pipeline/data/buyback_auth.py @@ -1 +1,17 @@ -__author__ = 'mtydykov' +""" +Dataset representing dates of upcoming earnings. +""" +from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype + +from .dataset import Column, DataSet + + +class BuybackAuthorizations(DataSet): + """ + Dataset representing dates of recently announced buyback authorization. 
+ """ + previous_buyback_value = Column(float64_dtype) + previous_buyback_share_count = Column(float64_dtype) + previous_buyback_value_announcement = Column(datetime64ns_dtype) + previous_buyback_share_count_announcement = Column(datetime64ns_dtype) + From 3142fa516fb190eede217f7d7a34ba35b87ce422 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Fri, 15 Jan 2016 16:58:04 -0500 Subject: [PATCH 03/12] ENH: add buyback_auth loader. WIP: finish refactoring blaze events loader. WIP: tests passing for earnings. BUG: pass all kwargs explicitly for BlazeEventsCalendarLoader. If this is not done, resources are not bound correctly. MAINT: refactor for buyback_auth. --- tests/pipeline/test_buyback_auth.py | 404 ++++++++++++++++++ tests/pipeline/test_earnings.py | 72 ++-- zipline/pipeline/data/__init__.py | 3 + zipline/pipeline/data/buyback_auth.py | 12 +- zipline/pipeline/factors/events.py | 112 +++-- zipline/pipeline/loaders/blaze/__init__.py | 14 + .../pipeline/loaders/blaze/buyback_auth.py | 134 ++++++ zipline/pipeline/loaders/blaze/core.py | 1 + zipline/pipeline/loaders/blaze/earnings.py | 98 +---- zipline/pipeline/loaders/blaze/events.py | 120 ++++++ zipline/pipeline/loaders/buyback_auth.py | 123 +++++- zipline/pipeline/loaders/earnings.py | 92 +--- zipline/pipeline/loaders/events.py | 103 ++++- zipline/pipeline/loaders/utils.py | 60 ++- 14 files changed, 1090 insertions(+), 258 deletions(-) create mode 100644 tests/pipeline/test_buyback_auth.py create mode 100644 zipline/pipeline/loaders/blaze/buyback_auth.py create mode 100644 zipline/pipeline/loaders/blaze/events.py diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py new file mode 100644 index 00000000..c76167ae --- /dev/null +++ b/tests/pipeline/test_buyback_auth.py @@ -0,0 +1,404 @@ +""" +Tests for the reference loader for EarningsCalendar. 
+""" +from unittest import TestCase + +import blaze as bz +from blaze.compute.core import swap_resources_into_scope +from contextlib2 import ExitStack +from nose_parameterized import parameterized +import pandas as pd +import numpy as np +from pandas.util.testing import assert_series_equal +from six import iteritems + +from zipline.pipeline import Pipeline +from zipline.pipeline.data import (CashBuybackAuthorizations, + ShareBuybackAuthorizations) +from zipline.pipeline.engine import SimplePipelineEngine +from zipline.pipeline.factors.events import ( + BusinessDaysSincePreviousCashBuybackAuth, + BusinessDaysSincePreviousShareBuybackAuth +) +from zipline.pipeline.loaders.buyback_auth import \ + CashBuybackAuthorizationsLoader, ShareBuybackAuthorizationsLoader +from zipline.pipeline.loaders.blaze import ( + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + CashBuybackAuthorizationsLoader, + SHARE_COUNT_FIELD_NAME, + SID_FIELD_NAME, + ShareBuybackAuthorizationsLoader, + TS_FIELD_NAME, + VALUE_FIELD_NAME +) +from zipline.utils.numpy_utils import make_datetime64D, np_NaT +from zipline.utils.test_utils import ( + make_simple_equity_info, + tmp_asset_finder, + gen_calendars, + num_days_in_range, +) + + +sids = A, B, C, D, E = range(5) + +equity_info = make_simple_equity_info( + sids, + start_date=pd.Timestamp('2013-01-01', tz='UTC'), + end_date=pd.Timestamp('2015-01-01', tz='UTC'), + ) + +buyback_authorizations = { + # K1--K2--A1--A2--SC1--SC2--V1--V2. + A: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', + '2014-01-20']), + SHARE_COUNT_FIELD_NAME: [1, 15], + VALUE_FIELD_NAME: [10, 20] + }), + # K1--K2--E2--E1. + B: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ + '2014-01-20', '2014-01-15']), + SHARE_COUNT_FIELD_NAME: [7, 13], VALUE_FIELD_NAME: [10, 22] + }), + # K1--E1--K2--E2. 
+ C: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-15']), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ + '2014-01-10', '2014-01-20']), + SHARE_COUNT_FIELD_NAME: [3, 1], + VALUE_FIELD_NAME: [4, 7] + }), + # K1 == K2. + D: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05'] * 2), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ + '2014-01-10', '2014-01-15']), + SHARE_COUNT_FIELD_NAME: [6, 23], + VALUE_FIELD_NAME: [1, 2] + }), + E: pd.DataFrame( + columns=["timestamp", + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME, + VALUE_FIELD_NAME], + dtype='datetime64[ns]' + ), + } + +param_dates = gen_calendars( + '2014-01-01', + '2014-01-31', + critical_dates=pd.to_datetime([ + '2014-01-05', + '2014-01-10', + '2014-01-15', + '2014-01-20', + ]), + ) + + +def zip_with_floats(flts, dates): + return pd.Series(flts, index=dates).astype('float') + + +def num_days_between(dates, start_date, end_date): + return num_days_in_range(dates, start_date, end_date) + + +def zip_with_dates(dts, dates): + return pd.Series(pd.to_datetime(dts), index=dates) + + +class BuybackAuthLoaderTestCase(TestCase): + """ + Tests for loading the earnings announcement data. + """ + + @classmethod + def setUpClass(cls): + cls._cleanup_stack = stack = ExitStack() + + cls.finder = stack.enter_context( + tmp_asset_finder(equities=equity_info), + ) + cls.cols = {} + cls.buyback_authorizations = None + + + @classmethod + def tearDownClass(cls): + cls._cleanup_stack.close() + + def loader_args(self, dates): + """Construct the base buyback authorizations object to pass to the + loader. + + Parameters + ---------- + dates : pd.DatetimeIndex + The dates we can serve. + + Returns + ------- + args : tuple[any] + The arguments to forward to the loader positionally. + """ + return dates, self.buyback_authorizations + + def setup(self, dates): + """ + Make a PipelineEngine and expectation functions for the given dates + calendar. 
+ + This exists to make it easy to test our various cases with critical + dates missing from the calendar. + """ + + _expected_previous_buyback_announcement = pd.DataFrame({ + A: zip_with_dates( + ['NaT'] * num_days_between(dates, None, '2014-01-14') + + ['2014-01-15'] * num_days_between(dates, '2014-01-15', '2014-01-19') + + ['2014-01-20'] * num_days_between(dates, '2014-01-20', None), + dates + ), + B: zip_with_dates( + ['NaT'] * num_days_between(dates, None, '2014-01-14') + + ['2014-01-15'] * num_days_between(dates, '2014-01-15', '2014-01-19') + + ['2014-01-20'] * num_days_between(dates, '2014-01-20', None), + dates + ), + C: zip_with_dates( + ['NaT'] * num_days_between(dates, None, '2014-01-09') + + ['2014-01-10'] * num_days_between(dates, '2014-01-10', '2014-01-19') + + ['2014-01-20'] * num_days_between(dates, '2014-01-20', None), + dates + ), + D: zip_with_dates( + ['NaT'] * num_days_between(dates, None, '2014-01-09') + + ['2014-01-10'] * num_days_between(dates, '2014-01-10', '2014-01-14') + + ['2014-01-15'] * num_days_between(dates, '2014-01-15', None), + dates + ), + E: zip_with_dates(['NaT'] * len(dates), dates), + }, index=dates) + + _expected_previous_busday_offsets = self._compute_busday_offsets( + _expected_previous_buyback_announcement + ) + + self.cols['previous_buyback_announcement'] = _expected_previous_buyback_announcement + self.cols['days_since_prev'] = _expected_previous_busday_offsets + + loader = self.loader_type(*self.loader_args(dates)) + engine = SimplePipelineEngine(lambda _: loader, dates, self.finder) + return engine + + @staticmethod + def _compute_busday_offsets(announcement_dates): + """ + Compute expected business day offsets from a DataFrame of announcement + dates. + """ + # Column-vector of dates on which factor `compute` will be called. + raw_call_dates = announcement_dates.index.values.astype( + 'datetime64[D]' + )[:, None] + + # 2D array of dates containining expected nexg announcement. 
+ raw_announce_dates = ( + announcement_dates.values.astype('datetime64[D]') + ) + + # Set NaTs to 0 temporarily because busday_count doesn't support NaT. + # We fill these entries with NaNs later. + whereNaT = raw_announce_dates == np_NaT + raw_announce_dates[whereNaT] = make_datetime64D(0) + + # The abs call here makes it so that we can use this function to + # compute offsets for both next and previous earnings (previous + # earnings offsets come back negative). + expected = abs(np.busday_count( + raw_call_dates, + raw_announce_dates + ).astype(float)) + + expected[whereNaT] = np.nan + return pd.DataFrame( + data=expected, + columns=announcement_dates.columns, + index=announcement_dates.index, + ) + + def _test_compute_buyback_auth(self, dates): + engine = self.setup(dates) + + pipe = Pipeline( + columns=self.pipeline_columns + ) + + result = engine.run_pipeline( + pipe, + start_date=dates[0], + end_date=dates[-1], + ) + + for sid in sids: + for col_name in self.cols.keys(): + assert_series_equal(result[col_name].xs(sid, level=1), + self.cols[col_name][sid], + sid) + + +class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderTestCase): + buyback_authorizations = {sid: df.drop(VALUE_FIELD_NAME, 1) + for sid, df in iteritems(buyback_authorizations)} + pipeline_columns = { + 'previous_buyback_share_count': + ShareBuybackAuthorizations.previous_share_count.latest, + 'previous_buyback_announcement': + ShareBuybackAuthorizations.previous_announcement_date.latest, + 'days_since_prev': + BusinessDaysSincePreviousShareBuybackAuth(), + } + + @classmethod + def setUpClass(cls): + super(ShareBuybackAuthLoaderTestCase, cls).setUpClass() + cls.buyback_authorizations = buyback_authorizations + cls.loader_type = ShareBuybackAuthorizationsLoader + + def setup(self, dates): + engine = super(ShareBuybackAuthLoaderTestCase, self).setup(dates) + _expected_previous_buyback_share_count = pd.DataFrame({ + A: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-14') + + [1] * 
num_days_between(dates, '2014-01-15', '2014-01-19') + + [15] * num_days_between(dates, '2014-01-20', None), dates), + B: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-14') + + [13] * num_days_between(dates, '2014-01-15', '2014-01-19') + + [7] * num_days_between(dates, '2014-01-20', None), dates), + C: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + + [3] * num_days_between(dates, '2014-01-10', '2014-01-19') + + [1] * num_days_between(dates, '2014-01-20', None), dates), + D: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + + [6] * num_days_between(dates, '2014-01-10', '2014-01-14') + + [23] * num_days_between(dates, '2014-01-15', None), dates), + E: zip_with_floats(['NaN'] * len(dates), dates), + }, index=dates) + self.cols['previous_buyback_share_count'] = _expected_previous_buyback_share_count + return engine + + @parameterized.expand(param_dates) + def test_compute_buyback_auth(self, dates): + self._test_compute_buyback_auth(dates) + + +class CashBuybackAuthLoaderTestCase(BuybackAuthLoaderTestCase): + buyback_authorizations = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) + for sid, df in iteritems(buyback_authorizations)} + pipeline_columns = { + 'previous_buyback_value': + CashBuybackAuthorizations.previous_value.latest, + 'previous_buyback_announcement': + CashBuybackAuthorizations.previous_announcement_date.latest, + 'days_since_prev': + BusinessDaysSincePreviousCashBuybackAuth(), + } + + @classmethod + def setUpClass(cls): + super(CashBuybackAuthLoaderTestCase, cls).setUpClass() + cls.buyback_authorizations = buyback_authorizations + cls.loader_type = CashBuybackAuthLoaderTestCase + + def setup(self, dates): + engine = super(ShareBuybackAuthLoaderTestCase, self).setup(dates) + _expected_previous_value = pd.DataFrame({ + # TODO if the next knowledge date is 10, why is the range + # until 15? 
+ A: zip_with_floats( + ['NaN'] * num_days_between(dates, None, '2014-01-14') + + [10] * num_days_between(dates, '2014-01-15', '2014-01-19') + + [20] * num_days_between(dates, '2014-01-20', None), dates), + B: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-14') + + [22] * num_days_between(dates, '2014-01-15', '2014-01-19') + + [10] * num_days_between(dates, '2014-01-20', None), dates), + C: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + + [4] * num_days_between(dates, '2014-01-10', '2014-01-19') + + [7] * num_days_between(dates, '2014-01-20', None), dates), + D: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + + [1] * num_days_between(dates, '2014-01-10', '2014-01-14') + + [2] * num_days_between(dates, '2014-01-15', None), dates), + E: zip_with_floats(['NaN'] * len(dates), dates), + }, index=dates) + self.cols['previous_buyback_value'] = _expected_previous_value + return engine + + @parameterized.expand(param_dates) + def test_compute_buyback_auth(self, dates): + self._test_compute_buyback_auth(dates) + + +# class BlazeBuybackAuthLoaderTestCase(BuybackAuthLoaderTestCase): +# loader_type = BlazeBuybackAuthorizationsLoader +# +# def loader_args(self, dates): +# _, mapping = super( +# BlazeBuybackAuthLoaderTestCase, +# self, +# ).loader_args(dates) +# return (bz.Data(pd.concat( +# pd.DataFrame({ +# BUYBACK_ANNOUNCEMENT_FIELD_NAME: +# frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME], +# SHARE_COUNT_FIELD_NAME: frame[SHARE_COUNT_FIELD_NAME], +# VALUE_FIELD_NAME: frame[VALUE_FIELD_NAME], +# TS_FIELD_NAME: frame.index, +# SID_FIELD_NAME: sid, +# }) +# for sid, frame in iteritems(mapping) +# ).reset_index(drop=True)),) +# +# +# class BlazeEarningsCalendarLoaderNotInteractiveTestCase( +# BlazeBuybackAuthLoaderTestCase): +# """Test case for passing a non-interactive symbol and a dict of resources. 
+# """ +# def loader_args(self, dates): +# (bound_expr,) = super( +# BlazeEarningsCalendarLoaderNotInteractiveTestCase, +# self, +# ).loader_args(dates) +# return swap_resources_into_scope(bound_expr, {}) +# +# +# class BuybackAuthLoaderInferTimestampTestCase(TestCase): +# def test_infer_timestamp(self): +# dtx = pd.date_range('2014-01-01', '2014-01-10') +# events_by_sid = { +# 0: pd.DataFrame({BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx}), +# 1: pd.DataFrame( +# {BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.Series(dtx, dtx)}, +# index=dtx +# ) +# } +# loader = BuybackAuthorizationsLoader( +# dtx, +# events_by_sid, +# infer_timestamps=True, +# ) +# self.assertEqual( +# loader.events_by_sid.keys(), +# events_by_sid.keys(), +# ) +# assert_series_equal( +# loader.events_by_sid[0][BUYBACK_ANNOUNCEMENT_FIELD_NAME], +# pd.Series(index=[dtx[0]] * 10, data=dtx), +# ) +# assert_series_equal( +# loader.events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME], +# events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME], +# ) diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index 57b72420..4f78fb68 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -57,30 +57,33 @@ class EarningsCalendarLoaderTestCase(TestCase): cls.earnings_dates = { # K1--K2--E1--E2. - A: to_series( - knowledge_dates=['2014-01-05', '2014-01-10'], - earning_dates=['2014-01-15', '2014-01-20'], - ), + A: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', + '2014-01-20']) + }), # K1--K2--E2--E1. - B: to_series( - knowledge_dates=['2014-01-05', '2014-01-10'], - earning_dates=['2014-01-20', '2014-01-15'] - ), + B: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-20', + '2014-01-15']) + }), # K1--E1--K2--E2. 
- C: to_series( - knowledge_dates=['2014-01-05', '2014-01-15'], - earning_dates=['2014-01-10', '2014-01-20'] - ), + C: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-15']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', + '2014-01-20']) + }), # K1 == K2. - D: to_series( - knowledge_dates=['2014-01-05'] * 2, - earning_dates=['2014-01-10', '2014-01-15'], - ), - E: pd.Series( - data=[], - index=pd.DatetimeIndex([]), - dtype='datetime64[ns]', - ), + D: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05'] * 2), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', + '2014-01-15']) + }), + E: pd.DataFrame({ + "timestamp": pd.to_datetime([]), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([]) + }) } @classmethod @@ -118,7 +121,8 @@ class EarningsCalendarLoaderTestCase(TestCase): def zip_with_dates(dts): return pd.Series(pd.to_datetime(dts), index=dates) - + # TODO: tests will break because I now need mappings of sid -> + # dataframe instead of sid -> series _expected_next_announce = pd.DataFrame({ A: zip_with_dates( ['NaT'] * num_days_between(None, '2014-01-04') + @@ -345,11 +349,11 @@ class BlazeEarningsCalendarLoaderTestCase(EarningsCalendarLoaderTestCase): ).loader_args(dates) return (bz.Data(pd.concat( pd.DataFrame({ - ANNOUNCEMENT_FIELD_NAME: earning_dates, - TS_FIELD_NAME: earning_dates.index, + ANNOUNCEMENT_FIELD_NAME: df[ANNOUNCEMENT_FIELD_NAME], + TS_FIELD_NAME: df[TS_FIELD_NAME], SID_FIELD_NAME: sid, }) - for sid, earning_dates in iteritems(mapping) + for sid, df in iteritems(mapping) ).reset_index(drop=True)),) @@ -369,8 +373,8 @@ class EarningsCalendarLoaderInferTimestampTestCase(TestCase): def test_infer_timestamp(self): dtx = pd.date_range('2014-01-01', '2014-01-10') announcement_dates = { - 0: dtx, - 1: pd.Series(dtx, dtx), + 0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), + 1: pd.DataFrame({TS_FIELD_NAME: dtx, ANNOUNCEMENT_FIELD_NAME: dtx}), } loader = EarningsCalendarLoader( dtx, @@ -378,14 +382,18 @@ class 
EarningsCalendarLoaderInferTimestampTestCase(TestCase): infer_timestamps=True, ) self.assertEqual( - loader.announcement_dates.keys(), + loader.events_by_sid.keys(), announcement_dates.keys(), ) assert_series_equal( - loader.announcement_dates[0], - pd.Series(index=[dtx[0]] * 10, data=dtx), + pd.Series(loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME]), + pd.Series(index=[dtx[0]] * 10, data=dtx, + name=ANNOUNCEMENT_FIELD_NAME), ) assert_series_equal( - loader.announcement_dates[1], - announcement_dates[1], + pd.Series(loader.events_by_sid[1][ANNOUNCEMENT_FIELD_NAME]), + pd.Series(index=announcement_dates[1][TS_FIELD_NAME], + data=np.array(announcement_dates[1][ + ANNOUNCEMENT_FIELD_NAME]), + name=ANNOUNCEMENT_FIELD_NAME) ) diff --git a/zipline/pipeline/data/__init__.py b/zipline/pipeline/data/__init__.py index 894bc86e..f7357f09 100644 --- a/zipline/pipeline/data/__init__.py +++ b/zipline/pipeline/data/__init__.py @@ -1,11 +1,14 @@ +from .buyback_auth import CashBuybackAuthorizations, ShareBuybackAuthorizations from .earnings import EarningsCalendar from .equity_pricing import USEquityPricing from .dataset import DataSet, Column, BoundColumn __all__ = [ 'BoundColumn', + 'CashBuybackAuthorizations', 'Column', 'DataSet', 'EarningsCalendar', + 'ShareBuybackAuthorizations', 'USEquityPricing', ] diff --git a/zipline/pipeline/data/buyback_auth.py b/zipline/pipeline/data/buyback_auth.py index dd58e9b2..7c1cf952 100644 --- a/zipline/pipeline/data/buyback_auth.py +++ b/zipline/pipeline/data/buyback_auth.py @@ -6,12 +6,14 @@ from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype from .dataset import Column, DataSet -class BuybackAuthorizations(DataSet): +class CashBuybackAuthorizations(DataSet): """ Dataset representing dates of recently announced buyback authorization. 
""" - previous_buyback_value = Column(float64_dtype) - previous_buyback_share_count = Column(float64_dtype) - previous_buyback_value_announcement = Column(datetime64ns_dtype) - previous_buyback_share_count_announcement = Column(datetime64ns_dtype) + previous_value = Column(float64_dtype) + previous_announcement_date = Column(datetime64ns_dtype) + +class ShareBuybackAuthorizations(DataSet): + previous_share_count = Column(float64_dtype) + previous_announcement_date = Column(datetime64ns_dtype) diff --git a/zipline/pipeline/factors/events.py b/zipline/pipeline/factors/events.py index 2491efc8..481e0e8a 100644 --- a/zipline/pipeline/factors/events.py +++ b/zipline/pipeline/factors/events.py @@ -3,6 +3,10 @@ Factors describing information about event data (e.g. earnings announcements, acquisitions, dividends, etc.). """ from numpy import newaxis +from zipline.pipeline.data.buyback_auth import ( + CashBuybackAuthorizations, + ShareBuybackAuthorizations +) from zipline.pipeline.data.earnings import EarningsCalendar from zipline.utils.numpy_utils import ( NaTD, @@ -14,10 +18,42 @@ from zipline.utils.numpy_utils import ( from .factor import Factor -class BusinessDaysUntilNextEarnings(Factor): +class BusinessDaysSincePreviousEvents(Factor): """ - Factor returning the number of **business days** (not trading days!) until - the next known earnings date for each asset. + Abstract class for business days since a previous event. + Returns the number of **business days** (not trading days!) since + the most recent event date for each asset. + + This doesn't use trading days for symmetry with + BusinessDaysUntilNextEarnings. + + Assets which announced or will announce the event today will produce a value + of 0.0. Assets that announced the event on the previous business day will + produce a value of 1.0. + + Assets for which the event date is `NaT` will produce a value of `NaN`. 
+ """ + window_length = 0 + dtype = float64_dtype + + def _compute(self, arrays, dates, assets, mask): + + # Coerce from [ns] to [D] for numpy busday_count. + announce_dates = arrays[0].astype(datetime64D_dtype) + + # Set masked values to NaT. + announce_dates[~mask] = np_NaT + + # Convert row labels into a column vector for broadcasted comparison. + reference_dates = dates.values.astype(datetime64D_dtype)[:, newaxis] + return busday_count_mask_NaT(announce_dates, reference_dates) + + +class BusinessDaysUntilNextEvents(Factor): + """ + Abstract class for business days since a next event. + Returns the number of **business days** (not trading days!) until + the next known event date for each asset. This doesn't use trading days because the trading calendar includes information that may not have been available to the algorithm at the time @@ -26,19 +62,12 @@ class BusinessDaysUntilNextEarnings(Factor): For example, the NYSE closings September 11th 2001, would not have been known to the algorithm on September 10th. - Assets that announced or will announce earnings today will produce a value - of 0.0. Assets that will announce earnings on the next upcoming business + Assets that announced or will announce the event today will produce a value + of 0.0. Assets that will announce the event on the next upcoming business day will produce a value of 1.0. - Assets for which `EarningsCalendar.next_announcement` is `NaT` will produce - a value of `NaN`. - - - See Also - -------- - zipline.pipeline.factors.BusinessDaysSincePreviousEarnings + Assets for which the event date is `NaT` will produce a value of `NaN`. 
""" - inputs = [EarningsCalendar.next_announcement] window_length = 0 dtype = float64_dtype @@ -55,37 +84,52 @@ class BusinessDaysUntilNextEarnings(Factor): return busday_count_mask_NaT(reference_dates, announce_dates) -class BusinessDaysSincePreviousEarnings(Factor): +class BusinessDaysUntilNextEarnings(BusinessDaysUntilNextEvents): + """ + Factor returning the number of **business days** (not trading days!) until + the next known earnings date for each asset. + + See Also + -------- + zipline.pipeline.factors.BusinessDaysSincePreviousEarnings + """ + inputs = [EarningsCalendar.next_announcement] + + +class BusinessDaysSincePreviousEarnings(BusinessDaysSincePreviousEvents): """ Factor returning the number of **business days** (not trading days!) since the most recent earnings date for each asset. - This doesn't use trading days for symmetry with - BusinessDaysUntilNextEarnings. - - Assets which announced or will announce earnings today will produce a value - of 0.0. Assets that announced earnings on the previous business day will - produce a value of 1.0. - - Assets for which `EarningsCalendar.previous_announcement` is `NaT` will - produce a value of `NaN`. - See Also -------- zipline.pipeline.factors.BusinessDaysUntilNextEarnings """ inputs = [EarningsCalendar.previous_announcement] - window_length = 0 - dtype = float64_dtype - def _compute(self, arrays, dates, assets, mask): - # Coerce from [ns] to [D] for numpy busday_count. - announce_dates = arrays[0].astype(datetime64D_dtype) +class BusinessDaysSincePreviousCashBuybackAuth(BusinessDaysSincePreviousEvents): + """ + Factor returning the number of **business days** (not trading days!) since + the most recent cash buyback authorization for each asset. - # Set masked values to NaT. 
- announce_dates[~mask] = NaTD + See Also + -------- + zipline.pipeline.factors.BusinessDaysUntilNextEarnings + """ + inputs = [CashBuybackAuthorizations.previous_announcement_date] - # Convert row labels into a column vector for broadcasted comparison. - reference_dates = dates.values.astype(datetime64D_dtype)[:, newaxis] - return busday_count_mask_NaT(announce_dates, reference_dates) + +class BusinessDaysSincePreviousShareBuybackAuth( + BusinessDaysSincePreviousEvents +): + """ + Factor returning the number of **business days** (not trading days!) since + the most recent share buyback authorization for each asset. + + + See Also + -------- + zipline.pipeline.factors.BusinessDaysUntilNextEarnings + """ + inputs = [ShareBuybackAuthorizations.previous_announcement_date] diff --git a/zipline/pipeline/loaders/blaze/__init__.py b/zipline/pipeline/loaders/blaze/__init__.py index 200fa583..9702d5ea 100644 --- a/zipline/pipeline/loaders/blaze/__init__.py +++ b/zipline/pipeline/loaders/blaze/__init__.py @@ -1,3 +1,7 @@ +from .buyback_auth import ( + CashBuybackAuthorizationsLoader, + ShareBuybackAuthorizationsLoader +) from .core import ( AD_FIELD_NAME, BlazeLoader, @@ -7,6 +11,11 @@ from .core import ( from_blaze, global_loader, ) +from .buyback_auth import ( + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME, + VALUE_FIELD_NAME +) from .earnings import ( ANNOUNCEMENT_FIELD_NAME, BlazeEarningsCalendarLoader, @@ -17,9 +26,14 @@ __all__ = ( 'ANNOUNCEMENT_FIELD_NAME', 'BlazeEarningsCalendarLoader', 'BlazeLoader', + 'BUYBACK_ANNOUNCEMENT_FIELD_NAME', + 'CashBuybackAuthorizationsLoader', 'NoDeltasWarning', + 'SHARE_COUNT_FIELD_NAME', 'SID_FIELD_NAME', + 'ShareBuybackAuthorizationsLoader', 'TS_FIELD_NAME', + 'VALUE_FIELD_NAME', 'from_blaze', 'global_loader', ) diff --git a/zipline/pipeline/loaders/blaze/buyback_auth.py b/zipline/pipeline/loaders/blaze/buyback_auth.py new file mode 100644 index 00000000..b98a03aa --- /dev/null +++ 
b/zipline/pipeline/loaders/blaze/buyback_auth.py @@ -0,0 +1,134 @@ +from .core import ( + TS_FIELD_NAME, + SID_FIELD_NAME, +) +from zipline.pipeline.data import (CashBuybackAuthorizations, + ShareBuybackAuthorizations) +from zipline.pipeline.loaders.buyback_auth import ( + CashBuybackAuthorizationsLoader, + ShareBuybackAuthorizationsLoader +) +from .events import BlazeEventsCalendarLoader + + +BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_dates' +SHARE_COUNT_FIELD_NAME = 'share_counts' +VALUE_FIELD_NAME = 'values' + + +class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): + """A pipeline loader for the ``BuybackAuth`` dataset that loads + data from a blaze expression. + + Parameters + ---------- + expr : Expr + The expression representing the data to load. + resources : dict, optional + Mapping from the atomic terms of ``expr`` to actual data resources. + odo_kwargs : dict, optional + Extra keyword arguments to pass to odo when executing the expression. + data_query_time : time, optional + The time to use for the data query cutoff. + data_query_tz : tzinfo or str + The timezeone to use for the data query cutoff. + + Notes + ----- + The expression should have a tabular dshape of:: + + Dim * {{ + {SID_FIELD_NAME}: int64, + {TS_FIELD_NAME}: datetime, + {BUYBACK_ANNOUNCEMENT_FIELD_NAME}: ?datetime, + {VALUE_FIELD_NAME}: ?float64 + }} + + Where each row of the table is a record including the sid to identify the + company, the timestamp where we learned about the announcement, the + date when the buyback was announced, the share count, and the value. + + If the '{TS_FIELD_NAME}' field is not included it is assumed that we + start the backtest with knowledge of all announcements. 
+ """ + __doc__ = __doc__.format( + TS_FIELD_NAME=TS_FIELD_NAME, + SID_FIELD_NAME=SID_FIELD_NAME, + BUYBACK_ANNOUNCEMENT_FIELD_NAME=BUYBACK_ANNOUNCEMENT_FIELD_NAME, + VALUE_FIELD_NAME=VALUE_FIELD_NAME + ) + + _expected_fields = frozenset({ + TS_FIELD_NAME, + SID_FIELD_NAME, + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + VALUE_FIELD_NAME + }) + + def __init__(self, + expr, + dataset=CashBuybackAuthorizations, + loader=CashBuybackAuthorizationsLoader, + **kwargs): + super( + BlazeCashBuybackAuthorizationsLoader, self + ).__init__(expr, dataset=dataset, loader=loader, **kwargs) + + +class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): + """A pipeline loader for the ``BuybackAuth`` dataset that loads + data from a blaze expression. + + Parameters + ---------- + expr : Expr + The expression representing the data to load. + resources : dict, optional + Mapping from the atomic terms of ``expr`` to actual data resources. + odo_kwargs : dict, optional + Extra keyword arguments to pass to odo when executing the expression. + data_query_time : time, optional + The time to use for the data query cutoff. + data_query_tz : tzinfo or str + The timezeone to use for the data query cutoff. + + Notes + ----- + The expression should have a tabular dshape of:: + + Dim * {{ + {SID_FIELD_NAME}: int64, + {TS_FIELD_NAME}: datetime, + {BUYBACK_ANNOUNCEMENT_FIELD_NAME}: ?datetime, + {SHARE_COUNT_FIELD_NAME}: ?float64, + }} + + Where each row of the table is a record including the sid to identify the + company, the timestamp where we learned about the announcement, the + date when the buyback was announced, the share count, and the value. + + If the '{TS_FIELD_NAME}' field is not included it is assumed that we + start the backtest with knowledge of all announcements. 
+ """ + __doc__ = __doc__.format( + TS_FIELD_NAME=TS_FIELD_NAME, + SID_FIELD_NAME=SID_FIELD_NAME, + BUYBACK_ANNOUNCEMENT_FIELD_NAME=BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME=SHARE_COUNT_FIELD_NAME, + ) + + _expected_fields = frozenset({ + TS_FIELD_NAME, + SID_FIELD_NAME, + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME, + }) + + def __init__(self, + expr, + dataset=ShareBuybackAuthorizations, + loader=ShareBuybackAuthorizationsLoader, + **kwargs): + super( + BlazeShareBuybackAuthorizationsLoader, self + ).__init__(expr, dataset=dataset, loader=loader, **kwargs) diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 0fcd52e0..df34c975 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -182,6 +182,7 @@ from zipline.utils.preprocess import preprocess AD_FIELD_NAME = 'asof_date' TS_FIELD_NAME = 'timestamp' SID_FIELD_NAME = 'sid' +ANNOUNCEMENT_FIELD_NAME = 'announcement_date' valid_deltas_node_types = ( bz.expr.Field, bz.expr.ReLabel, diff --git a/zipline/pipeline/loaders/blaze/earnings.py b/zipline/pipeline/loaders/blaze/earnings.py index 9f5137f8..287949da 100644 --- a/zipline/pipeline/loaders/blaze/earnings.py +++ b/zipline/pipeline/loaders/blaze/earnings.py @@ -1,29 +1,14 @@ -from datashape import istabular -import pandas as pd -from toolz import valmap - from .core import ( + ANNOUNCEMENT_FIELD_NAME, TS_FIELD_NAME, - SID_FIELD_NAME, - bind_expression_to_resources, - ffill_query_in_range, + SID_FIELD_NAME ) from zipline.pipeline.data import EarningsCalendar -from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.earnings import EarningsCalendarLoader -from zipline.pipeline.loaders.utils import ( - check_data_query_args, - normalize_data_query_bounds, - normalize_timestamp_to_query_time, -) -from zipline.utils.input_validation import ensure_timezone, optionally -from zipline.utils.preprocess import preprocess +from 
.events import BlazeEventsCalendarLoader -ANNOUNCEMENT_FIELD_NAME = 'announcement_date' - - -class BlazeEarningsCalendarLoader(PipelineLoader): +class BlazeEarningsCalendarLoader(BlazeEventsCalendarLoader): """A pipeline loader for the ``EarningsCalendar`` dataset that loads data from a blaze expression. @@ -57,6 +42,7 @@ class BlazeEarningsCalendarLoader(PipelineLoader): If the '{TS_FIELD_NAME}' field is not included it is assumed that we start the backtest with knowledge of all announcements. """ + __doc__ = __doc__.format( TS_FIELD_NAME=TS_FIELD_NAME, SID_FIELD_NAME=SID_FIELD_NAME, @@ -69,75 +55,17 @@ class BlazeEarningsCalendarLoader(PipelineLoader): ANNOUNCEMENT_FIELD_NAME, }) - @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, expr, resources=None, odo_kwargs=None, data_query_time=None, data_query_tz=None, - dataset=EarningsCalendar): - dshape = expr.dshape - - if not istabular(dshape): - raise ValueError( - 'expression dshape must be tabular, got: %s' % dshape, - ) - - expected_fields = self._expected_fields - self._expr = bind_expression_to_resources( - expr[list(expected_fields)], - resources, - ) - self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {} - self._dataset = dataset - check_data_query_args(data_query_time, data_query_tz) - self._data_query_time = data_query_time - self._data_query_tz = data_query_tz - - def load_adjusted_array(self, columns, dates, assets, mask): - data_query_time = self._data_query_time - data_query_tz = self._data_query_tz - lower_dt, upper_dt = normalize_data_query_bounds( - dates[0], - dates[-1], - data_query_time, - data_query_tz, - ) - - raw = ffill_query_in_range( - self._expr, - lower_dt, - upper_dt, - self._odo_kwargs, - ) - sids = raw.loc[:, SID_FIELD_NAME] - raw.drop( - sids[~sids.isin(assets)].index, - inplace=True - ) - if data_query_time is not None: - normalize_timestamp_to_query_time( - raw, - data_query_time, - data_query_tz, - inplace=True, - ts_field=TS_FIELD_NAME, - ) - - gb 
= raw.groupby(SID_FIELD_NAME) - - def mkseries(idx, raw_loc=raw.loc): - vs = raw_loc[ - idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME] - ].values - return pd.Series( - index=pd.DatetimeIndex(vs[:, 0]), - data=vs[:, 1], - ) - - return EarningsCalendarLoader( - dates, - valmap(mkseries, gb.groups), - dataset=self._dataset, - ).load_adjusted_array(columns, dates, assets, mask) + dataset=EarningsCalendar, + loader=EarningsCalendarLoader, + **kwargs): + super( + BlazeEarningsCalendarLoader, self + ).__init__(expr, dataset=dataset, loader=loader, resources=resources, + odo_kwargs=odo_kwargs, data_query_time=data_query_time, + data_query_tz=data_query_tz, **kwargs) diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py new file mode 100644 index 00000000..c74cad5b --- /dev/null +++ b/zipline/pipeline/loaders/blaze/events.py @@ -0,0 +1,120 @@ +from datashape import istabular + +from .core import ( + TS_FIELD_NAME, + SID_FIELD_NAME, + bind_expression_to_resources, + ffill_query_in_range, +) +from zipline.pipeline.loaders.base import PipelineLoader +from zipline.pipeline.loaders.utils import ( + check_data_query_args, + normalize_data_query_bounds, + normalize_timestamp_to_query_time, +) +from zipline.utils.input_validation import ensure_timezone, optionally +from zipline.utils.preprocess import preprocess + + + +class BlazeEventsCalendarLoader(PipelineLoader): + """An abstract pipeline loader for the events datasets that loads + data from a blaze expression. + + Parameters + ---------- + expr : Expr + The expression representing the data to load. + resources : dict, optional + Mapping from the atomic terms of ``expr`` to actual data resources. + odo_kwargs : dict, optional + Extra keyword arguments to pass to odo when executing the expression. + data_query_time : time, optional + The time to use for the data query cutoff. + data_query_tz : tzinfo or str + The timezeone to use for the data query cutoff. 
+ + Notes + ----- + The expression should have a tabular dshape of:: + + Dim * {{ + {SID_FIELD_NAME}: int64, + {TS_FIELD_NAME}: datetime, + }} + + And other dataset-specific fields, where each row of the table is a + record including the sid to identify the company, the timestamp where we + learned about the announcement, and the date when the earnings will be + announced. + + If the '{TS_FIELD_NAME}' field is not included it is assumed that we + start the backtest with knowledge of all announcements. + """ + + @preprocess(data_query_tz=optionally(ensure_timezone)) + def __init__(self, + expr, + resources=None, + odo_kwargs=None, + data_query_time=None, + data_query_tz=None, + dataset=None, + loader=None): + dshape = expr.dshape + + if not istabular(dshape): + raise ValueError( + 'expression dshape must be tabular, got: %s' % dshape, + ) + + expected_fields = self._expected_fields + self._expr = bind_expression_to_resources( + expr[list(expected_fields)], + resources, + ) + self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {} + self._dataset = dataset + check_data_query_args(data_query_time, data_query_tz) + self._data_query_time = data_query_time + self._data_query_tz = data_query_tz + self._loader = loader + + def load_adjusted_array(self, columns, dates, assets, mask): + data_query_time = self._data_query_time + data_query_tz = self._data_query_tz + lower_dt, upper_dt = normalize_data_query_bounds( + dates[0], + dates[-1], + data_query_time, + data_query_tz, + ) + + raw = ffill_query_in_range( + self._expr, + lower_dt, + upper_dt, + self._odo_kwargs, + ) + sids = raw.loc[:, SID_FIELD_NAME] + raw.drop( + sids[~sids.isin(assets)].index, + inplace=True + ) + if data_query_time is not None: + normalize_timestamp_to_query_time( + raw, + data_query_time, + data_query_tz, + inplace=True, + ts_field=TS_FIELD_NAME, + ) + gb = raw.groupby(SID_FIELD_NAME) + return self._loader( + dates, + self.prepare_data(raw, gb), + dataset=self._dataset, + 
).load_adjusted_array(columns, dates, assets, mask) + + def prepare_data(self, raw, gb): + return {sid: raw.loc[group] for sid, group in gb.groups.iteritems()} diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index 8710fd29..02727e53 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -1 +1,122 @@ -__author__ = 'mtydykov' +""" +Reference implementation for EarningsCalendar loaders. +""" + +from ..data.buyback_auth import CashBuybackAuthorizations, \ + ShareBuybackAuthorizations +from events import EventsLoader +from zipline.utils.memoize import lazyval + + +BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_dates' +SHARE_COUNT_FIELD_NAME = 'share_counts' +VALUE_FIELD_NAME = 'values' + + +# TODO: split into 2 datasets - or just think about how to generalize since +# we will often have cases where we have a knowledge date and, optionally, +# a value for that event; having no value (like earnings) is a special case. +class CashBuybackAuthorizationsLoader(EventsLoader): + """ + Reference loader for + :class:`zipline.pipeline.data.earnings.BuybackAuthorizations`. + + Does not currently support adjustments to the dates of known buyback + authorizations. + + events_by_sid: dict[sid -> pd.DataFrame(knowledge date, + event date, value)] + + """ + + def __init__(self, + all_dates, + events_by_sid, + infer_timestamps=False, + dataset=CashBuybackAuthorizations): + super(CashBuybackAuthorizationsLoader, self).__init__( + all_dates, + events_by_sid, + infer_timestamps=infer_timestamps, + dataset=dataset + ) + + + def get_loader(self, column): + """dispatch to the loader for ``column``. + """ + if column is self.dataset.previous_value: + return self.previous_buyback_value_loader + elif column is self.dataset.previous_announcement_date: + return self.previous_event_date_loader + else: + raise ValueError("Don't know how to load column '%s'." 
% column) + + + @lazyval + def previous_buyback_value_loader(self): + return self._previous_event_value_loader( + self.dataset.previous_value, + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + VALUE_FIELD_NAME + ) + + @lazyval + def previous_event_date_loader(self): + return self._previous_event_date_loader( + self.dataset.previous_announcement_date, + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + ) + + +class ShareBuybackAuthorizationsLoader(EventsLoader): + """ + Reference loader for + :class:`zipline.pipeline.data.earnings.BuybackAuthorizations`. + + Does not currently support adjustments to the dates of known buyback + authorizations. + + events_by_sid: dict[sid -> pd.DataFrame(knowledge date, + event date, value)] + + """ + + def __init__(self, + all_dates, + events_by_sid, + infer_timestamps=False, + dataset=ShareBuybackAuthorizations): + super(ShareBuybackAuthorizationsLoader, self).__init__( + all_dates, + events_by_sid, + infer_timestamps=infer_timestamps, + dataset=dataset + ) + + + def get_loader(self, column): + """dispatch to the loader for ``column``. + """ + if column is self.dataset.previous_share_count: + return self.previous_buyback_share_count_loader + elif column is self.dataset.previous_announcement_date: + return self.previous_event_date_loader + else: + raise ValueError("Don't know how to load column '%s'." 
% column) + + + @lazyval + def previous_buyback_share_count_loader(self): + return self._previous_event_value_loader( + self.dataset.previous_share_count, + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME + ) + + @lazyval + def previous_event_date_loader(self): + return self._previous_event_date_loader( + self.dataset.previous_announcement_date, + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + ) diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index 9912faff..f93645d0 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -1,71 +1,21 @@ """ Reference implementation for EarningsCalendar loaders. """ -from itertools import repeat -import pandas as pd -from six import iteritems -from toolz import merge - -from .base import PipelineLoader -from .frame import DataFrameLoader -from .utils import next_date_frame, previous_date_frame +from events import EventsLoader from ..data.earnings import EarningsCalendar from zipline.utils.memoize import lazyval +ANNOUNCEMENT_FIELD_NAME = "announcement_date" -class EarningsCalendarLoader(PipelineLoader): - """ - Reference loader for - :class:`zipline.pipeline.data.earnings.EarningsCalendar`. - Does not currently support adjustments to the dates of known earnings. - - Parameters - ---------- - all_dates : pd.DatetimeIndex - Index of dates for which we can serve queries. - announcement_dates : dict[int -> pd.Series or pd.DatetimeIndex] - Dict mapping sids to objects representing dates on which earnings - occurred. - - If a dict value is a Series, it's interpreted as a mapping from the - date on which we learned an announcement was coming to the date on - which the announcement was made. - - If a dict value is a DatetimeIndex, it's interpreted as just containing - the dates that announcements were made, and we assume we knew about the - announcement on all prior dates. 
This mode is only supported if - ``infer_timestamp`` is explicitly passed as a truthy value. - - infer_timestamps : bool, optional - Whether to allow passing ``DatetimeIndex`` values in - ``announcement_dates``. - """ - def __init__(self, - all_dates, - announcement_dates, - infer_timestamps=False, +class EarningsCalendarLoader(EventsLoader): + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, dataset=EarningsCalendar): - self.all_dates = all_dates - self.announcement_dates = announcement_dates = ( - announcement_dates.copy() - ) - dates = self.all_dates.values - for k, v in iteritems(announcement_dates): - if isinstance(v, pd.DatetimeIndex): - if not infer_timestamps: - raise ValueError( - "Got DatetimeIndex of announcement dates for sid %d.\n" - "Pass `infer_timestamps=True` to use the first date in" - " `all_dates` as implicit timestamp." - ) - # If we are passed a DatetimeIndex, we always have - # knowledge of the announcements. - announcement_dates[k] = pd.Series( - v, index=repeat(dates[0], len(v)), - ) - self.dataset = dataset + super(EarningsCalendarLoader, self).__init__(all_dates, + events_by_sid, + infer_timestamps, + dataset=dataset) def get_loader(self, column): """Dispatch to the loader for ``column``. 
@@ -79,30 +29,12 @@ class EarningsCalendarLoader(PipelineLoader): @lazyval def next_announcement_loader(self): - return DataFrameLoader( - self.dataset.next_announcement, - next_date_frame( - self.all_dates, - self.announcement_dates, - ), - adjustments=None, - ) + return self._next_event_date_loader(self.dataset.next_announcement, + ANNOUNCEMENT_FIELD_NAME) @lazyval def previous_announcement_loader(self): - return DataFrameLoader( + return self._previous_event_date_loader( self.dataset.previous_announcement, - previous_date_frame( - self.all_dates, - self.announcement_dates, - ), - adjustments=None, - ) - - def load_adjusted_array(self, columns, dates, assets, mask): - return merge( - self.get_loader(column).load_adjusted_array( - [column], dates, assets, mask - ) - for column in columns + ANNOUNCEMENT_FIELD_NAME ) diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 1e6fe7e0..8c6e2a5e 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -1,39 +1,71 @@ -from abc import abstractmethod -from itertools import repeat +from abc import ABCMeta, abstractmethod +import numpy as np import pandas as pd from six import iteritems from toolz import merge from .base import PipelineLoader from .frame import DataFrameLoader -from zipline.utils.memoize import lazyval +from .utils import next_date_frame, previous_date_frame, previous_value + +TS_FIELD_NAME = "timestamp" class EventsLoader(PipelineLoader): + """ + Abstract loader. + + Does not currently support adjustments to the dates of known events. + + Parameters + ---------- + all_dates : pd.DatetimeIndex + Index of dates for which we can serve queries. + events_by_sid : dict[int -> pd.Series] + Dict mapping sids to objects representing dates on which events + occurred. + + If a dict value is a Series, it's interpreted as a mapping from the + date on which we learned an announcement was coming to the date on + which the announcement was made. 
+ + If a dict value is a DatetimeIndex, it's interpreted as just containing + the dates that announcements were made, and we assume we knew about the + announcement on all prior dates. This mode is only supported if + ``infer_timestamp`` is explicitly passed as a truthy value. + + infer_timestamps : bool, optional + Whether to allow passing ``DatetimeIndex`` values in + ``announcement_dates``. + """ + def __init__(self, all_dates, - announcement_dates, + events_by_sid, infer_timestamps=False, dataset=None): self.all_dates = all_dates - self.announcement_dates = ( - announcement_dates.copy() + # TODO: why are we making a copy here? We end up with a copy that we + # modify and then don't use, and an unmodified original which we do use. + self.events_by_sid = ( + events_by_sid.copy() ) dates = self.all_dates.values - for k, v in iteritems(announcement_dates): - if isinstance(v, pd.DatetimeIndex): + + for k, v in iteritems(events_by_sid): + if "timestamp" not in v.columns: if not infer_timestamps: raise ValueError( "Got DatetimeIndex of announcement dates for sid %d.\n" "Pass `infer_timestamps=True` to use the first date in" " `all_dates` as implicit timestamp." ) - # If we are passed a DatetimeIndex, we always have - # knowledge of the announcements. 
- announcement_dates[k] = pd.Series( - v, index=repeat(dates[0], len(v)), - ) + self.events_by_sid[k] = v = v.copy() + v.index = [dates[0]] * len(v) + else: + self.events_by_sid[k] = v.set_index("timestamp") + self.dataset = dataset @@ -50,13 +82,48 @@ class EventsLoader(PipelineLoader): for column in columns ) - @lazyval - def date_frame_loader(self, col_name, next_or_prev): + def mk_date_series(self, date_field_name): + return {sid: pd.Series(index=event.index, + data=np.array(event[date_field_name])) + for sid, event in iteritems(self.events_by_sid)} + + def _next_event_date_loader(self, next_date_field, event_date_field_name): return DataFrameLoader( - col_name, - next_or_prev( + next_date_field, + next_date_frame( self.all_dates, - self.announcement_dates, + self.mk_date_series(event_date_field_name), ), adjustments=None, ) + + def _previous_event_date_loader(self, prev_date_field, event_date_field_name): + return DataFrameLoader( + prev_date_field, + previous_date_frame( + self.all_dates, + self.mk_date_series(event_date_field_name), + ), + adjustments=None, + ) + + def _previous_event_value_loader(self, + previous_value_field, + event_date_field_name, + value_field_name): + return DataFrameLoader( + previous_value_field, + previous_value( + self.all_dates, + self.events_by_sid, + event_date_field_name, + value_field_name, + previous_value_field.dtype, + # TODO: need to get actual name/method to use to get missing + # value + None + ), + adjustments=None, + ) + + diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 7f9448e7..ff4ee899 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -1,6 +1,7 @@ import datetime import numpy as np +from numpy import NaN import pandas as pd from six import iteritems from six.moves import zip @@ -82,14 +83,14 @@ def previous_date_frame(date_index, events_by_sid): next_date_frame """ sids = list(events_by_sid) - out = np.full((len(date_index), len(sids)), 
NaTns, dtype='datetime64[ns]') - dn = date_index[-1].asm8 + out = np.full((len(date_index), len(sids)), np_NaT, dtype='datetime64[ns]') + d_n = date_index[-1].asm8 for col_idx, sid in enumerate(sids): # events_by_sid[sid] is Series mapping knowledge_date to actual # event_date. We don't care about the knowledge date for # computing previous earnings. values = events_by_sid[sid].values - values = values[values <= dn] + values = values[values <= d_n] out[date_index.searchsorted(values), col_idx] = values frame = pd.DataFrame(out, index=date_index, columns=sids) @@ -97,6 +98,59 @@ def previous_date_frame(date_index, events_by_sid): return frame +def previous_value(date_index, events_by_sid, event_date_field, value_field, + value_field_dtype, missing_value): + """ + Make a DataFrame representing simulated next earnings date_index. + + Parameters + ---------- + date_index : DatetimeIndex. + The index of the returned DataFrame. + events_by_sid : dict[int -> DatetimeIndex] + Dict mapping sids to a series of dates. Each k:v pair of the series + represents the date we learned of the event mapping to the date the + event will occur. + + Returns + ------- + previous_events: pd.DataFrame + A DataFrame where each column is a security from `events_by_sid` where + the values are the dates of the previous event that occured on the date + of the index. Entries falling before the first date will have `NaT` as + the result in the output. + + See Also + -------- + next_date_frame + """ + sids = list(events_by_sid) + # TODO: generalize; need to use dtype of column and missing value for that + # column; so pass + # in the pipeline column's attributes for these (replace NaN and dtype + # below) + out = np.full( + (len(date_index), len(sids)), + # TODO; replace with missing_value + NaN, + dtype=value_field_dtype + ) + d_n = date_index[-1].asm8 + for col_idx, sid in enumerate(sids): + # events_by_sid[sid] is DataFrame mapping knowledge_date to event + # date and value. 
We don't care about the knowledge date for computing + # previous values. + df = events_by_sid[sid] + df = df[df[event_date_field] <= d_n] + out[ + date_index.searchsorted(df[event_date_field].values), col_idx + ] = df[value_field] + + frame = pd.DataFrame(out, index=date_index, columns=sids) + frame.ffill(inplace=True) + return frame + + def normalize_data_query_time(dt, time, tz): """Apply the correct time and timezone to a date. From a877fcfdb6b2e3b1642dde4a7e3517e039ddb4c3 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Wed, 10 Feb 2016 15:00:16 -0500 Subject: [PATCH 04/12] TST: finish blaze tests for buyback_auth. DOC: update docs. MAINT: use correct names. BUG: explicitly pass all kwargs. DOC: update docs. STY: fix whitespace. TST: rename vars and update docstring. TST: fix indentation. MAINT: fix comments. --- tests/pipeline/test_buyback_auth.py | 574 ++++++++++-------- tests/pipeline/test_earnings.py | 24 +- zipline/pipeline/data/buyback_auth.py | 9 +- zipline/pipeline/factors/events.py | 14 +- zipline/pipeline/loaders/blaze/__init__.py | 13 +- .../pipeline/loaders/blaze/buyback_auth.py | 56 +- zipline/pipeline/loaders/blaze/earnings.py | 4 + zipline/pipeline/loaders/blaze/events.py | 12 +- zipline/pipeline/loaders/buyback_auth.py | 26 +- zipline/pipeline/loaders/earnings.py | 2 +- zipline/pipeline/loaders/events.py | 41 +- 11 files changed, 453 insertions(+), 322 deletions(-) diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index c76167ae..6b7218c0 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -1,14 +1,15 @@ """ Tests for the reference loader for EarningsCalendar. 
""" +from functools import partial from unittest import TestCase import blaze as bz from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack from nose_parameterized import parameterized -import pandas as pd import numpy as np +import pandas as pd from pandas.util.testing import assert_series_equal from six import iteritems @@ -23,85 +24,90 @@ from zipline.pipeline.factors.events import ( from zipline.pipeline.loaders.buyback_auth import \ CashBuybackAuthorizationsLoader, ShareBuybackAuthorizationsLoader from zipline.pipeline.loaders.blaze import ( + BlazeCashBuybackAuthorizationsLoader, + BlazeShareBuybackAuthorizationsLoader, BUYBACK_ANNOUNCEMENT_FIELD_NAME, - CashBuybackAuthorizationsLoader, SHARE_COUNT_FIELD_NAME, SID_FIELD_NAME, - ShareBuybackAuthorizationsLoader, TS_FIELD_NAME, - VALUE_FIELD_NAME + CASH_FIELD_NAME ) from zipline.utils.numpy_utils import make_datetime64D, np_NaT from zipline.utils.test_utils import ( - make_simple_equity_info, - tmp_asset_finder, gen_calendars, + make_simple_equity_info, num_days_in_range, + tmp_asset_finder, ) sids = A, B, C, D, E = range(5) equity_info = make_simple_equity_info( - sids, - start_date=pd.Timestamp('2013-01-01', tz='UTC'), - end_date=pd.Timestamp('2015-01-01', tz='UTC'), - ) + sids, + start_date=pd.Timestamp('2013-01-01', tz='UTC'), + end_date=pd.Timestamp('2015-01-01', tz='UTC'), +) buyback_authorizations = { - # K1--K2--A1--A2--SC1--SC2--V1--V2. - A: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', - '2014-01-20']), - SHARE_COUNT_FIELD_NAME: [1, 15], - VALUE_FIELD_NAME: [10, 20] - }), - # K1--K2--E2--E1. - B: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ - '2014-01-20', '2014-01-15']), - SHARE_COUNT_FIELD_NAME: [7, 13], VALUE_FIELD_NAME: [10, 22] - }), - # K1--E1--K2--E2. 
- C: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-15']), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ - '2014-01-10', '2014-01-20']), - SHARE_COUNT_FIELD_NAME: [3, 1], - VALUE_FIELD_NAME: [4, 7] - }), - # K1 == K2. - D: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05'] * 2), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ - '2014-01-10', '2014-01-15']), - SHARE_COUNT_FIELD_NAME: [6, 23], - VALUE_FIELD_NAME: [1, 2] - }), - E: pd.DataFrame( - columns=["timestamp", - BUYBACK_ANNOUNCEMENT_FIELD_NAME, - SHARE_COUNT_FIELD_NAME, - VALUE_FIELD_NAME], - dtype='datetime64[ns]' - ), - } - -param_dates = gen_calendars( - '2014-01-01', - '2014-01-31', - critical_dates=pd.to_datetime([ - '2014-01-05', - '2014-01-10', - '2014-01-15', - '2014-01-20', + # K1--K2--A1--A2--SC1--SC2--V1--V2. + A: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', + '2014-01-20']), + SHARE_COUNT_FIELD_NAME: [1, 15], + CASH_FIELD_NAME: [10, 20] + }), + # K1--K2--E2--E1. + B: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ + '2014-01-20', '2014-01-15' ]), - ) + SHARE_COUNT_FIELD_NAME: [7, 13], CASH_FIELD_NAME: [10, 22] + }), + # K1--E1--K2--E2. + C: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05', '2014-01-15']), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ + '2014-01-10', '2014-01-20' + ]), + SHARE_COUNT_FIELD_NAME: [3, 1], + CASH_FIELD_NAME: [4, 7] + }), + # K1 == K2. 
+ D: pd.DataFrame({ + "timestamp": pd.to_datetime(['2014-01-05'] * 2), + BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ + '2014-01-10', '2014-01-15' + ]), + SHARE_COUNT_FIELD_NAME: [6, 23], + CASH_FIELD_NAME: [1, 2] + }), + E: pd.DataFrame( + columns=["timestamp", + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME, + CASH_FIELD_NAME], + dtype='datetime64[ns]' + ), +} + +# Must be a list - can't use generator since this needs to be used more than +# once. +param_dates = list(gen_calendars( + '2014-01-01', + '2014-01-31', + critical_dates=pd.to_datetime([ + '2014-01-05', + '2014-01-10', + '2014-01-15', + '2014-01-20', + ]), +)) -def zip_with_floats(flts, dates): +def zip_with_floats(dates, flts): return pd.Series(flts, index=dates).astype('float') @@ -109,30 +115,15 @@ def num_days_between(dates, start_date, end_date): return num_days_in_range(dates, start_date, end_date) -def zip_with_dates(dts, dates): - return pd.Series(pd.to_datetime(dts), index=dates) +def zip_with_dates(index_dates, dts): + return pd.Series(pd.to_datetime(dts), index=index_dates) -class BuybackAuthLoaderTestCase(TestCase): +class BuybackAuthLoaderCommonTest: """ - Tests for loading the earnings announcement data. + Tests for loading the buyback authorization announcement data. """ - @classmethod - def setUpClass(cls): - cls._cleanup_stack = stack = ExitStack() - - cls.finder = stack.enter_context( - tmp_asset_finder(equities=equity_info), - ) - cls.cols = {} - cls.buyback_authorizations = None - - - @classmethod - def tearDownClass(cls): - cls._cleanup_stack.close() - def loader_args(self, dates): """Construct the base buyback authorizations object to pass to the loader. @@ -149,54 +140,65 @@ class BuybackAuthLoaderTestCase(TestCase): """ return dates, self.buyback_authorizations - def setup(self, dates): + def setup_engine(self, dates): """ - Make a PipelineEngine and expectation functions for the given dates - calendar. + Make a Pipeline Enigne object based on the given dates. 
+ """ + loader = self.loader_type(*self.loader_args(dates)) + return SimplePipelineEngine(lambda _: loader, dates, self.finder) + + def setup_expected_cols(self, dates): + """ + Make expectation functions for the given dates calendar. This exists to make it easy to test our various cases with critical dates missing from the calendar. """ - + num_days_between_for_dates = partial(num_days_between, dates) + zip_with_dates_for_dates = partial(zip_with_dates, dates) _expected_previous_buyback_announcement = pd.DataFrame({ - A: zip_with_dates( - ['NaT'] * num_days_between(dates, None, '2014-01-14') + - ['2014-01-15'] * num_days_between(dates, '2014-01-15', '2014-01-19') + - ['2014-01-20'] * num_days_between(dates, '2014-01-20', None), - dates + A: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), ), - B: zip_with_dates( - ['NaT'] * num_days_between(dates, None, '2014-01-14') + - ['2014-01-15'] * num_days_between(dates, '2014-01-15', '2014-01-19') + - ['2014-01-20'] * num_days_between(dates, '2014-01-20', None), - dates + B: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), ), - C: zip_with_dates( - ['NaT'] * num_days_between(dates, None, '2014-01-09') + - ['2014-01-10'] * num_days_between(dates, '2014-01-10', '2014-01-19') + - ['2014-01-20'] * num_days_between(dates, '2014-01-20', None), - dates + C: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + + ['2014-01-10'] * num_days_between_for_dates('2014-01-10', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), ), - D: zip_with_dates( - ['NaT'] * num_days_between(dates, None, '2014-01-09') + 
- ['2014-01-10'] * num_days_between(dates, '2014-01-10', '2014-01-14') + - ['2014-01-15'] * num_days_between(dates, '2014-01-15', None), - dates + D: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + + ['2014-01-10'] * num_days_between_for_dates('2014-01-10', + '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + None), ), - E: zip_with_dates(['NaT'] * len(dates), dates), + E: zip_with_dates_for_dates(['NaT'] * len(dates)), }, index=dates) _expected_previous_busday_offsets = self._compute_busday_offsets( _expected_previous_buyback_announcement ) - self.cols['previous_buyback_announcement'] = _expected_previous_buyback_announcement + # Common cols for buyback authorization datasets are announcement + # date and days since previous. + self.cols[ + 'previous_buyback_announcement' + ] = _expected_previous_buyback_announcement self.cols['days_since_prev'] = _expected_previous_busday_offsets - loader = self.loader_type(*self.loader_args(dates)) - engine = SimplePipelineEngine(lambda _: loader, dates, self.finder) - return engine - @staticmethod def _compute_busday_offsets(announcement_dates): """ @@ -234,7 +236,8 @@ class BuybackAuthLoaderTestCase(TestCase): ) def _test_compute_buyback_auth(self, dates): - engine = self.setup(dates) + engine = self.setup_engine(dates) + self.setup_expected_cols(dates) pipe = Pipeline( columns=self.pipeline_columns @@ -253,152 +256,247 @@ class BuybackAuthLoaderTestCase(TestCase): sid) -class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderTestCase): - buyback_authorizations = {sid: df.drop(VALUE_FIELD_NAME, 1) +class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): + """ + Test for cash buyback authorizations dataset. 
+ """ + buyback_authorizations = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) for sid, df in iteritems(buyback_authorizations)} pipeline_columns = { - 'previous_buyback_share_count': - ShareBuybackAuthorizations.previous_share_count.latest, - 'previous_buyback_announcement': - ShareBuybackAuthorizations.previous_announcement_date.latest, - 'days_since_prev': - BusinessDaysSincePreviousShareBuybackAuth(), - } + 'previous_buyback_cash': + CashBuybackAuthorizations.previous_value.latest, + 'previous_buyback_announcement': + CashBuybackAuthorizations.previous_announcement_date.latest, + 'days_since_prev': + BusinessDaysSincePreviousCashBuybackAuth(), + } @classmethod def setUpClass(cls): - super(ShareBuybackAuthLoaderTestCase, cls).setUpClass() + cls._cleanup_stack = stack = ExitStack() + cls.finder = stack.enter_context( + tmp_asset_finder(equities=equity_info), + ) + cls.cols = {} + cls.buyback_authorizations = buyback_authorizations + cls.loader_type = CashBuybackAuthorizationsLoader + + @classmethod + def tearDownClass(cls): + cls._cleanup_stack.close() + + def setup(self, dates): + zip_with_floats_dates = partial(zip_with_floats, dates) + num_days_between_dates = partial(num_days_between, dates) + super(CashBuybackAuthLoaderTestCase, self).setup_expected_cols(dates) + _expected_previous_cash = pd.DataFrame({ + # TODO if the next knowledge date is 10, why is the range + # until 15? 
+ A: zip_with_floats_dates( + ['NaN'] * num_days_between(dates, None, '2014-01-14') + + [10] * num_days_between_dates('2014-01-15', '2014-01-19') + + [20] * num_days_between_dates('2014-01-20', None) + ), + B: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-14') + + [22] * num_days_between_dates('2014-01-15', '2014-01-19') + + [10] * num_days_between_dates('2014-01-20', None) + ), + C: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-09') + + [4] * num_days_between_dates('2014-01-10', '2014-01-19') + + [7] * num_days_between_dates('2014-01-20', None) + ), + D: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-09') + + [1] * num_days_between_dates('2014-01-10', '2014-01-14') + + [2] * num_days_between_dates('2014-01-15', None) + ), + E: zip_with_floats_dates(['NaN'] * len(dates)), + }, index=dates) + self.cols['previous_buyback_cash'] = _expected_previous_cash + + @parameterized.expand(param_dates) + def test_compute_cash_buyback_auth(self, dates): + self._test_compute_buyback_auth(dates) + + +class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderCommonTest, TestCase): + """ + Test for share buyback authorizations dataset. 
+ """ + buyback_authorizations = {sid: df.drop(CASH_FIELD_NAME, 1) + for sid, df in iteritems(buyback_authorizations)} + pipeline_columns = { + 'previous_buyback_share_count': + ShareBuybackAuthorizations.previous_share_count.latest, + 'previous_buyback_announcement': + ShareBuybackAuthorizations.previous_announcement_date.latest, + 'days_since_prev': + BusinessDaysSincePreviousShareBuybackAuth(), + } + + @classmethod + def setUpClass(cls): + cls._cleanup_stack = stack = ExitStack() + cls.finder = stack.enter_context( + tmp_asset_finder(equities=equity_info), + ) + cls.cols = {} cls.buyback_authorizations = buyback_authorizations cls.loader_type = ShareBuybackAuthorizationsLoader + @classmethod + def tearDownClass(cls): + cls._cleanup_stack.close() + def setup(self, dates): - engine = super(ShareBuybackAuthLoaderTestCase, self).setup(dates) + zip_with_floats_dates = partial(zip_with_floats, dates) + num_days_between_dates = partial(num_days_between, dates) + super(ShareBuybackAuthLoaderTestCase, self).setup_expected_cols(dates) _expected_previous_buyback_share_count = pd.DataFrame({ - A: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-14') + - [1] * num_days_between(dates, '2014-01-15', '2014-01-19') + - [15] * num_days_between(dates, '2014-01-20', None), dates), - B: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-14') + - [13] * num_days_between(dates, '2014-01-15', '2014-01-19') + - [7] * num_days_between(dates, '2014-01-20', None), dates), - C: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + - [3] * num_days_between(dates, '2014-01-10', '2014-01-19') + - [1] * num_days_between(dates, '2014-01-20', None), dates), - D: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + - [6] * num_days_between(dates, '2014-01-10', '2014-01-14') + - [23] * num_days_between(dates, '2014-01-15', None), dates), - E: zip_with_floats(['NaN'] * len(dates), dates), - }, index=dates) - 
self.cols['previous_buyback_share_count'] = _expected_previous_buyback_share_count - return engine + A: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-14') + + [1] * num_days_between_dates('2014-01-15', '2014-01-19') + + [15] * num_days_between_dates('2014-01-20', None) + ), + B: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-14') + + [13] * num_days_between_dates('2014-01-15', '2014-01-19') + + [7] * num_days_between_dates('2014-01-20', None) + ), + C: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-09') + + [3] * num_days_between_dates('2014-01-10', '2014-01-19') + + [1] * num_days_between_dates('2014-01-20', None) + ), + D: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-09') + + [6] * num_days_between_dates('2014-01-10', '2014-01-14') + + [23] * num_days_between_dates('2014-01-15', None) + ), + E: zip_with_floats_dates(['NaN'] * len(dates)), + }, index=dates) + self.cols[ + 'previous_buyback_share_count' + ] = _expected_previous_buyback_share_count @parameterized.expand(param_dates) - def test_compute_buyback_auth(self, dates): + def test_compute_share_buyback_auth(self, dates): self._test_compute_buyback_auth(dates) -class CashBuybackAuthLoaderTestCase(BuybackAuthLoaderTestCase): - buyback_authorizations = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) - for sid, df in iteritems(buyback_authorizations)} - pipeline_columns = { - 'previous_buyback_value': - CashBuybackAuthorizations.previous_value.latest, - 'previous_buyback_announcement': - CashBuybackAuthorizations.previous_announcement_date.latest, - 'days_since_prev': - BusinessDaysSincePreviousCashBuybackAuth(), - } +def mapping_to_df(mapping): + return (bz.Data(pd.concat( + pd.DataFrame({ + BUYBACK_ANNOUNCEMENT_FIELD_NAME: + frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME], + SHARE_COUNT_FIELD_NAME: + frame[SHARE_COUNT_FIELD_NAME], + CASH_FIELD_NAME: + frame[CASH_FIELD_NAME], + TS_FIELD_NAME: + frame[TS_FIELD_NAME], + 
SID_FIELD_NAME: sid, + }) + for sid, frame in iteritems(mapping) + ).reset_index(drop=True)),) + +class BlazeCashBuybackAuthLoaderTestCase(CashBuybackAuthLoaderTestCase): + """ Test case for loading via blaze. + """ @classmethod def setUpClass(cls): - super(CashBuybackAuthLoaderTestCase, cls).setUpClass() - cls.buyback_authorizations = buyback_authorizations - cls.loader_type = CashBuybackAuthLoaderTestCase + super(BlazeCashBuybackAuthLoaderTestCase, cls).setUpClass() + cls.loader_type = BlazeCashBuybackAuthorizationsLoader - def setup(self, dates): - engine = super(ShareBuybackAuthLoaderTestCase, self).setup(dates) - _expected_previous_value = pd.DataFrame({ - # TODO if the next knowledge date is 10, why is the range - # until 15? - A: zip_with_floats( - ['NaN'] * num_days_between(dates, None, '2014-01-14') + - [10] * num_days_between(dates, '2014-01-15', '2014-01-19') + - [20] * num_days_between(dates, '2014-01-20', None), dates), - B: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-14') + - [22] * num_days_between(dates, '2014-01-15', '2014-01-19') + - [10] * num_days_between(dates, '2014-01-20', None), dates), - C: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + - [4] * num_days_between(dates, '2014-01-10', '2014-01-19') + - [7] * num_days_between(dates, '2014-01-20', None), dates), - D: zip_with_floats(['NaN'] * num_days_between(dates, None, '2014-01-09') + - [1] * num_days_between(dates, '2014-01-10', '2014-01-14') + - [2] * num_days_between(dates, '2014-01-15', None), dates), - E: zip_with_floats(['NaN'] * len(dates), dates), - }, index=dates) - self.cols['previous_buyback_value'] = _expected_previous_value - return engine - - @parameterized.expand(param_dates) - def test_compute_buyback_auth(self, dates): - self._test_compute_buyback_auth(dates) + def loader_args(self, dates): + _, mapping = super( + BlazeCashBuybackAuthLoaderTestCase, + self, + ).loader_args(dates) + return mapping_to_df(mapping) -# class 
BlazeBuybackAuthLoaderTestCase(BuybackAuthLoaderTestCase): -# loader_type = BlazeBuybackAuthorizationsLoader -# -# def loader_args(self, dates): -# _, mapping = super( -# BlazeBuybackAuthLoaderTestCase, -# self, -# ).loader_args(dates) -# return (bz.Data(pd.concat( -# pd.DataFrame({ -# BUYBACK_ANNOUNCEMENT_FIELD_NAME: -# frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME], -# SHARE_COUNT_FIELD_NAME: frame[SHARE_COUNT_FIELD_NAME], -# VALUE_FIELD_NAME: frame[VALUE_FIELD_NAME], -# TS_FIELD_NAME: frame.index, -# SID_FIELD_NAME: sid, -# }) -# for sid, frame in iteritems(mapping) -# ).reset_index(drop=True)),) -# -# -# class BlazeEarningsCalendarLoaderNotInteractiveTestCase( -# BlazeBuybackAuthLoaderTestCase): -# """Test case for passing a non-interactive symbol and a dict of resources. -# """ -# def loader_args(self, dates): -# (bound_expr,) = super( -# BlazeEarningsCalendarLoaderNotInteractiveTestCase, -# self, -# ).loader_args(dates) -# return swap_resources_into_scope(bound_expr, {}) -# -# -# class BuybackAuthLoaderInferTimestampTestCase(TestCase): -# def test_infer_timestamp(self): -# dtx = pd.date_range('2014-01-01', '2014-01-10') -# events_by_sid = { -# 0: pd.DataFrame({BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx}), -# 1: pd.DataFrame( -# {BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.Series(dtx, dtx)}, -# index=dtx -# ) -# } -# loader = BuybackAuthorizationsLoader( -# dtx, -# events_by_sid, -# infer_timestamps=True, -# ) -# self.assertEqual( -# loader.events_by_sid.keys(), -# events_by_sid.keys(), -# ) -# assert_series_equal( -# loader.events_by_sid[0][BUYBACK_ANNOUNCEMENT_FIELD_NAME], -# pd.Series(index=[dtx[0]] * 10, data=dtx), -# ) -# assert_series_equal( -# loader.events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME], -# events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME], -# ) +class BlazeShareBuybackAuthLoaderTestCase(ShareBuybackAuthLoaderTestCase): + """ Test case for loading via blaze. 
+ """ + @classmethod + def setUpClass(cls): + super(BlazeShareBuybackAuthLoaderTestCase, cls).setUpClass() + cls.loader_type = BlazeShareBuybackAuthorizationsLoader + + def loader_args(self, dates): + _, mapping = super( + BlazeShareBuybackAuthLoaderTestCase, + self, + ).loader_args(dates) + return mapping_to_df(mapping) + + +class BlazeShareBuybackAuthLoaderNotInteractiveTestCase( + BlazeShareBuybackAuthLoaderTestCase): + """Test case for passing a non-interactive symbol and a dict of resources. + """ + def loader_args(self, dates): + (bound_expr,) = super( + BlazeShareBuybackAuthLoaderNotInteractiveTestCase, + self, + ).loader_args(dates) + return swap_resources_into_scope(bound_expr, {}) + + +class BlazeCashBuybackAuthLoaderNotInteractiveTestCase( + BlazeCashBuybackAuthLoaderTestCase): + """Test case for passing a non-interactive symbol and a dict of resources. + """ + def loader_args(self, dates): + (bound_expr,) = super( + BlazeCashBuybackAuthLoaderNotInteractiveTestCase, + self, + ).loader_args(dates) + return swap_resources_into_scope(bound_expr, {}) + + +class BuybackAuthLoaderInferTimestampTestCase(TestCase): + @parameterized.expand([[CashBuybackAuthorizationsLoader], + [ShareBuybackAuthorizationsLoader]]) + def test_infer_timestamp(self, loader): + dtx = pd.date_range('2014-01-01', '2014-01-10') + events_by_sid = { + # No timestamp column - should index by first given date + 0: pd.DataFrame({BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx}), + # timestamp column exists - should index by it + 1: pd.DataFrame( + {BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx, + TS_FIELD_NAME: dtx} + ) + } + loader = loader( + dtx, + events_by_sid, + infer_timestamps=True, + ) + self.assertEqual( + loader.events_by_sid.keys(), + events_by_sid.keys(), + ) + + # Check that index by first given date has been added + assert_series_equal( + loader.events_by_sid[0][BUYBACK_ANNOUNCEMENT_FIELD_NAME], + pd.Series(index=[dtx[0]] * 10, + data=dtx, + name=BUYBACK_ANNOUNCEMENT_FIELD_NAME), + ) + + # Check 
that timestamp column was turned into index + modified_events_by_sid_date_col = pd.Series(data=np.array( + events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME]), + index=events_by_sid[1][TS_FIELD_NAME], + name=BUYBACK_ANNOUNCEMENT_FIELD_NAME) + assert_series_equal( + loader.events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME], + modified_events_by_sid_date_col, + ) diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index 4f78fb68..aecc2c88 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -7,8 +7,8 @@ import blaze as bz from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack from nose_parameterized import parameterized -import pandas as pd import numpy as np +import pandas as pd from pandas.util.testing import assert_series_equal from six import iteritems @@ -16,8 +16,8 @@ from zipline.pipeline import Pipeline from zipline.pipeline.data import EarningsCalendar from zipline.pipeline.engine import SimplePipelineEngine from zipline.pipeline.factors.events import ( - BusinessDaysUntilNextEarnings, BusinessDaysSincePreviousEarnings, + BusinessDaysUntilNextEarnings, ) from zipline.pipeline.loaders.earnings import EarningsCalendarLoader from zipline.pipeline.loaders.blaze import ( @@ -28,11 +28,10 @@ from zipline.pipeline.loaders.blaze import ( ) from zipline.utils.numpy_utils import make_datetime64D, NaTD from zipline.utils.test_utils import ( - make_simple_equity_info, - tmp_asset_finder, gen_calendars, - to_series, + make_simple_equity_info, num_days_in_range, + tmp_asset_finder, ) @@ -121,8 +120,7 @@ class EarningsCalendarLoaderTestCase(TestCase): def zip_with_dates(dts): return pd.Series(pd.to_datetime(dts), index=dates) - # TODO: tests will break because I now need mappings of sid -> - # dataframe instead of sid -> series + _expected_next_announce = pd.DataFrame({ A: zip_with_dates( ['NaT'] * num_days_between(None, '2014-01-04') + @@ -374,7 +372,9 @@ class 
EarningsCalendarLoaderInferTimestampTestCase(TestCase): dtx = pd.date_range('2014-01-01', '2014-01-10') announcement_dates = { 0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), - 1: pd.DataFrame({TS_FIELD_NAME: dtx, ANNOUNCEMENT_FIELD_NAME: dtx}), + 1: pd.DataFrame( + {TS_FIELD_NAME: dtx, ANNOUNCEMENT_FIELD_NAME: dtx} + ), } loader = EarningsCalendarLoader( dtx, @@ -387,13 +387,15 @@ class EarningsCalendarLoaderInferTimestampTestCase(TestCase): ) assert_series_equal( pd.Series(loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME]), - pd.Series(index=[dtx[0]] * 10, data=dtx, + pd.Series(index=[dtx[0]] * 10, + data=dtx, name=ANNOUNCEMENT_FIELD_NAME), ) assert_series_equal( pd.Series(loader.events_by_sid[1][ANNOUNCEMENT_FIELD_NAME]), pd.Series(index=announcement_dates[1][TS_FIELD_NAME], - data=np.array(announcement_dates[1][ - ANNOUNCEMENT_FIELD_NAME]), + data=np.array( + announcement_dates[1][ANNOUNCEMENT_FIELD_NAME] + ), name=ANNOUNCEMENT_FIELD_NAME) ) diff --git a/zipline/pipeline/data/buyback_auth.py b/zipline/pipeline/data/buyback_auth.py index 7c1cf952..8541d2a8 100644 --- a/zipline/pipeline/data/buyback_auth.py +++ b/zipline/pipeline/data/buyback_auth.py @@ -1,5 +1,5 @@ """ -Dataset representing dates of upcoming earnings. +Datasets representing dates of recently announced buyback authorizations. """ from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype @@ -8,12 +8,17 @@ from .dataset import Column, DataSet class CashBuybackAuthorizations(DataSet): """ - Dataset representing dates of recently announced buyback authorization. + Dataset representing dates of recently announced cash buyback + authorizations. """ previous_value = Column(float64_dtype) previous_announcement_date = Column(datetime64ns_dtype) class ShareBuybackAuthorizations(DataSet): + """ + Dataset representing dates of recently announced share buyback + authorizations. 
+ """ previous_share_count = Column(float64_dtype) previous_announcement_date = Column(datetime64ns_dtype) diff --git a/zipline/pipeline/factors/events.py b/zipline/pipeline/factors/events.py index 481e0e8a..127aae29 100644 --- a/zipline/pipeline/factors/events.py +++ b/zipline/pipeline/factors/events.py @@ -27,9 +27,9 @@ class BusinessDaysSincePreviousEvents(Factor): This doesn't use trading days for symmetry with BusinessDaysUntilNextEarnings. - Assets which announced or will announce the event today will produce a value - of 0.0. Assets that announced the event on the previous business day will - produce a value of 1.0. + Assets which announced or will announce the event today will produce a + value of 0.0. Assets that announced the event on the previous business + day will produce a value of 1.0. Assets for which the event date is `NaT` will produce a value of `NaN`. """ @@ -108,14 +108,16 @@ class BusinessDaysSincePreviousEarnings(BusinessDaysSincePreviousEvents): inputs = [EarningsCalendar.previous_announcement] -class BusinessDaysSincePreviousCashBuybackAuth(BusinessDaysSincePreviousEvents): +class BusinessDaysSincePreviousCashBuybackAuth( + BusinessDaysSincePreviousEvents +): """ Factor returning the number of **business days** (not trading days!) since the most recent cash buyback authorization for each asset. 
See Also -------- - zipline.pipeline.factors.BusinessDaysUntilNextEarnings + zipline.pipeline.factors.BusinessDaysSincePreviousCashBuybackAuth """ inputs = [CashBuybackAuthorizations.previous_announcement_date] @@ -130,6 +132,6 @@ class BusinessDaysSincePreviousShareBuybackAuth( See Also -------- - zipline.pipeline.factors.BusinessDaysUntilNextEarnings + zipline.pipeline.factors.BusinessDaysSincePreviousShareBuybackAuth """ inputs = [ShareBuybackAuthorizations.previous_announcement_date] diff --git a/zipline/pipeline/loaders/blaze/__init__.py b/zipline/pipeline/loaders/blaze/__init__.py index 9702d5ea..301cbc7d 100644 --- a/zipline/pipeline/loaders/blaze/__init__.py +++ b/zipline/pipeline/loaders/blaze/__init__.py @@ -1,6 +1,7 @@ + from .buyback_auth import ( - CashBuybackAuthorizationsLoader, - ShareBuybackAuthorizationsLoader + BlazeCashBuybackAuthorizationsLoader, + BlazeShareBuybackAuthorizationsLoader ) from .core import ( AD_FIELD_NAME, @@ -14,7 +15,7 @@ from .core import ( from .buyback_auth import ( BUYBACK_ANNOUNCEMENT_FIELD_NAME, SHARE_COUNT_FIELD_NAME, - VALUE_FIELD_NAME + CASH_FIELD_NAME ) from .earnings import ( ANNOUNCEMENT_FIELD_NAME, @@ -24,16 +25,16 @@ from .earnings import ( __all__ = ( 'AD_FIELD_NAME', 'ANNOUNCEMENT_FIELD_NAME', + 'BlazeCashBuybackAuthorizationsLoader', 'BlazeEarningsCalendarLoader', 'BlazeLoader', + 'BlazeShareBuybackAuthorizationsLoader', 'BUYBACK_ANNOUNCEMENT_FIELD_NAME', - 'CashBuybackAuthorizationsLoader', 'NoDeltasWarning', 'SHARE_COUNT_FIELD_NAME', 'SID_FIELD_NAME', - 'ShareBuybackAuthorizationsLoader', 'TS_FIELD_NAME', - 'VALUE_FIELD_NAME', + 'CASH_FIELD_NAME', 'from_blaze', 'global_loader', ) diff --git a/zipline/pipeline/loaders/blaze/buyback_auth.py b/zipline/pipeline/loaders/blaze/buyback_auth.py index b98a03aa..e92c3b9c 100644 --- a/zipline/pipeline/loaders/blaze/buyback_auth.py +++ b/zipline/pipeline/loaders/blaze/buyback_auth.py @@ -5,19 +5,17 @@ from .core import ( from zipline.pipeline.data import 
(CashBuybackAuthorizations, ShareBuybackAuthorizations) from zipline.pipeline.loaders.buyback_auth import ( + BUYBACK_ANNOUNCEMENT_FIELD_NAME, CashBuybackAuthorizationsLoader, - ShareBuybackAuthorizationsLoader + CASH_FIELD_NAME, + ShareBuybackAuthorizationsLoader, + SHARE_COUNT_FIELD_NAME ) from .events import BlazeEventsCalendarLoader -BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_dates' -SHARE_COUNT_FIELD_NAME = 'share_counts' -VALUE_FIELD_NAME = 'values' - - class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): - """A pipeline loader for the ``BuybackAuth`` dataset that loads + """A pipeline loader for the ``CashBuybackAuthorizations`` dataset that loads data from a blaze expression. Parameters @@ -32,6 +30,10 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. + dataset: DataSet + The DataSet object for which this loader loads data. + loader: EventsLoader + The reference loader to use for this dataset. Notes ----- @@ -41,12 +43,12 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): {SID_FIELD_NAME}: int64, {TS_FIELD_NAME}: datetime, {BUYBACK_ANNOUNCEMENT_FIELD_NAME}: ?datetime, - {VALUE_FIELD_NAME}: ?float64 + {CASH_FIELD_NAME}: ?float64 }} Where each row of the table is a record including the sid to identify the company, the timestamp where we learned about the announcement, the - date when the buyback was announced, the share count, and the value. + date when the buyback was announced, the share count, and the cash amount. If the '{TS_FIELD_NAME}' field is not included it is assumed that we start the backtest with knowledge of all announcements. 
@@ -55,28 +57,39 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): TS_FIELD_NAME=TS_FIELD_NAME, SID_FIELD_NAME=SID_FIELD_NAME, BUYBACK_ANNOUNCEMENT_FIELD_NAME=BUYBACK_ANNOUNCEMENT_FIELD_NAME, - VALUE_FIELD_NAME=VALUE_FIELD_NAME + CASH_FIELD_NAME=CASH_FIELD_NAME ) _expected_fields = frozenset({ TS_FIELD_NAME, SID_FIELD_NAME, BUYBACK_ANNOUNCEMENT_FIELD_NAME, - VALUE_FIELD_NAME + CASH_FIELD_NAME }) def __init__(self, expr, + resources=None, + odo_kwargs=None, + data_query_time=None, + data_query_tz=None, dataset=CashBuybackAuthorizations, loader=CashBuybackAuthorizationsLoader, **kwargs): super( BlazeCashBuybackAuthorizationsLoader, self - ).__init__(expr, dataset=dataset, loader=loader, **kwargs) + ).__init__(expr, + resources=resources, + odo_kwargs=odo_kwargs, + data_query_time=data_query_time, + data_query_tz=data_query_tz, + dataset=dataset, + loader=loader, + **kwargs) class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): - """A pipeline loader for the ``BuybackAuth`` dataset that loads + """A pipeline loader for the ``ShareBuybackAuthorizations`` dataset that loads data from a blaze expression. Parameters @@ -91,6 +104,10 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. + dataset: DataSet + The DataSet object for which this loader loads data. + loader: EventsLoader + The reference loader to use for this dataset. 
Notes ----- @@ -126,9 +143,20 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): def __init__(self, expr, + resources=None, + odo_kwargs=None, + data_query_time=None, + data_query_tz=None, dataset=ShareBuybackAuthorizations, loader=ShareBuybackAuthorizationsLoader, **kwargs): super( BlazeShareBuybackAuthorizationsLoader, self - ).__init__(expr, dataset=dataset, loader=loader, **kwargs) + ).__init__(expr, + resources=resources, + odo_kwargs=odo_kwargs, + data_query_time=data_query_time, + data_query_tz=data_query_tz, + dataset=dataset, + loader=loader, + **kwargs) diff --git a/zipline/pipeline/loaders/blaze/earnings.py b/zipline/pipeline/loaders/blaze/earnings.py index 287949da..a08c2fd8 100644 --- a/zipline/pipeline/loaders/blaze/earnings.py +++ b/zipline/pipeline/loaders/blaze/earnings.py @@ -24,6 +24,10 @@ class BlazeEarningsCalendarLoader(BlazeEventsCalendarLoader): The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. + dataset: DataSet + The DataSet object for which this loader loads data. + loader: EventsLoader + The reference loader to use for this dataset. Notes ----- diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index c74cad5b..8196377c 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -16,7 +16,6 @@ from zipline.utils.input_validation import ensure_timezone, optionally from zipline.utils.preprocess import preprocess - class BlazeEventsCalendarLoader(PipelineLoader): """An abstract pipeline loader for the events datasets that loads data from a blaze expression. @@ -33,7 +32,10 @@ class BlazeEventsCalendarLoader(PipelineLoader): The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. - + dataset : DataSet + The DataSet object for which this loader loads data. 
+ concrete_loader : + The concrete loader to use for loading data into specified columns. Notes ----- The expression should have a tabular dshape of:: @@ -60,7 +62,7 @@ class BlazeEventsCalendarLoader(PipelineLoader): data_query_time=None, data_query_tz=None, dataset=None, - loader=None): + concrete_loader=None): dshape = expr.dshape if not istabular(dshape): @@ -78,7 +80,7 @@ class BlazeEventsCalendarLoader(PipelineLoader): check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz - self._loader = loader + self._concrete_loader = concrete_loader def load_adjusted_array(self, columns, dates, assets, mask): data_query_time = self._data_query_time @@ -110,7 +112,7 @@ class BlazeEventsCalendarLoader(PipelineLoader): ts_field=TS_FIELD_NAME, ) gb = raw.groupby(SID_FIELD_NAME) - return self._loader( + return self._concrete_loader( dates, self.prepare_data(raw, gb), dataset=self._dataset, diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index 02727e53..038a89c0 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -2,30 +2,26 @@ Reference implementation for EarningsCalendar loaders. """ -from ..data.buyback_auth import CashBuybackAuthorizations, \ +from ..data.buyback_auth import ( + CashBuybackAuthorizations, ShareBuybackAuthorizations +) from events import EventsLoader from zipline.utils.memoize import lazyval BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_dates' SHARE_COUNT_FIELD_NAME = 'share_counts' -VALUE_FIELD_NAME = 'values' +CASH_FIELD_NAME = 'cash' -# TODO: split into 2 datasets - or just think about how to generalize since -# we will often have cases where we have a knowledge date and, optionally, -# a value for that event; having no value (like earnings) is a special case. 
class CashBuybackAuthorizationsLoader(EventsLoader): """ Reference loader for - :class:`zipline.pipeline.data.earnings.BuybackAuthorizations`. - - Does not currently support adjustments to the dates of known buyback - authorizations. + :class:`zipline.pipeline.data.earnings.CashBuybackAuthorizations`. events_by_sid: dict[sid -> pd.DataFrame(knowledge date, - event date, value)] + event date, cash value)] """ @@ -41,7 +37,6 @@ class CashBuybackAuthorizationsLoader(EventsLoader): dataset=dataset ) - def get_loader(self, column): """dispatch to the loader for ``column``. """ @@ -52,13 +47,12 @@ class CashBuybackAuthorizationsLoader(EventsLoader): else: raise ValueError("Don't know how to load column '%s'." % column) - @lazyval def previous_buyback_value_loader(self): return self._previous_event_value_loader( self.dataset.previous_value, BUYBACK_ANNOUNCEMENT_FIELD_NAME, - VALUE_FIELD_NAME + CASH_FIELD_NAME ) @lazyval @@ -72,13 +66,13 @@ class CashBuybackAuthorizationsLoader(EventsLoader): class ShareBuybackAuthorizationsLoader(EventsLoader): """ Reference loader for - :class:`zipline.pipeline.data.earnings.BuybackAuthorizations`. + :class:`zipline.pipeline.data.earnings.ShareBuybackAuthorizations`. Does not currently support adjustments to the dates of known buyback authorizations. events_by_sid: dict[sid -> pd.DataFrame(knowledge date, - event date, value)] + event date, share value)] """ @@ -94,7 +88,6 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): dataset=dataset ) - def get_loader(self, column): """dispatch to the loader for ``column``. """ @@ -105,7 +98,6 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): else: raise ValueError("Don't know how to load column '%s'." 
% column) - @lazyval def previous_buyback_share_count_loader(self): return self._previous_event_value_loader( diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index f93645d0..e2030430 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -2,8 +2,8 @@ Reference implementation for EarningsCalendar loaders. """ -from events import EventsLoader from ..data.earnings import EarningsCalendar +from events import EventsLoader from zipline.utils.memoize import lazyval ANNOUNCEMENT_FIELD_NAME = "announcement_date" diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 8c6e2a5e..9920a729 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -1,4 +1,4 @@ -from abc import ABCMeta, abstractmethod +from abc import abstractmethod import numpy as np import pandas as pd @@ -22,22 +22,19 @@ class EventsLoader(PipelineLoader): ---------- all_dates : pd.DatetimeIndex Index of dates for which we can serve queries. - events_by_sid : dict[int -> pd.Series] - Dict mapping sids to objects representing dates on which events - occurred. + events_by_sid : dict[int -> pd.DataFrame] + Dict mapping sids to DataFrames representing dates on which events + occurred along with other associated values. - If a dict value is a Series, it's interpreted as a mapping from the - date on which we learned an announcement was coming to the date on - which the announcement was made. + If the DataFrames contain a "timestamp" column, that column is + interpreted as the date on which we learned about the event. - If a dict value is a DatetimeIndex, it's interpreted as just containing - the dates that announcements were made, and we assume we knew about the - announcement on all prior dates. This mode is only supported if - ``infer_timestamp`` is explicitly passed as a truthy value. 
+ If the DataFrames do not contain a "timestamp" column, we assume we + knew about the event on all prior dates. This mode is only supported + if ``infer_timestamp`` is explicitly passed as a truthy value. infer_timestamps : bool, optional - Whether to allow passing ``DatetimeIndex`` values in - ``announcement_dates``. + Whether to allow omitting the "timestamp" column. """ def __init__(self, @@ -46,8 +43,9 @@ class EventsLoader(PipelineLoader): infer_timestamps=False, dataset=None): self.all_dates = all_dates - # TODO: why are we making a copy here? We end up with a copy that we - # modify and then don't use, and an unmodified original which we do use. + + # Do not modify the original in place, since it may be used for other + # purposes. self.events_by_sid = ( events_by_sid.copy() ) @@ -57,7 +55,8 @@ class EventsLoader(PipelineLoader): if "timestamp" not in v.columns: if not infer_timestamps: raise ValueError( - "Got DatetimeIndex of announcement dates for sid %d.\n" + "Got DataFrame without a 'timestamp' column for " + "sid %d.\n" "Pass `infer_timestamps=True` to use the first date in" " `all_dates` as implicit timestamp." 
) @@ -68,11 +67,9 @@ class EventsLoader(PipelineLoader): self.dataset = dataset - @abstractmethod def get_loader(self): - raise NotImplementedError("EventsLoader must implement 'get_loader'.") - + raise NotImplementedError("Must implement 'get_loader'.") def load_adjusted_array(self, columns, dates, assets, mask): return merge( @@ -97,7 +94,9 @@ class EventsLoader(PipelineLoader): adjustments=None, ) - def _previous_event_date_loader(self, prev_date_field, event_date_field_name): + def _previous_event_date_loader(self, + prev_date_field, + event_date_field_name): return DataFrameLoader( prev_date_field, previous_date_frame( @@ -125,5 +124,3 @@ class EventsLoader(PipelineLoader): ), adjustments=None, ) - - From 7100e60474c13f754d676dec8e108f7275f020ed Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 16 Feb 2016 17:57:01 -0500 Subject: [PATCH 05/12] MAINT: accept more data structures, verify, and select loader dynamically. MAINT: add fields based on changes to events loader. MAINT: modify based on expectations of events loader. MAINT: modify args. TST: clean up and clarify df access. TST: fix bugs in test that didn't properly split datasets' data. MAINT: fix merge error. 
--- tests/pipeline/test_buyback_auth.py | 81 ++++++++++--------- tests/pipeline/test_earnings.py | 8 +- tests/pipeline/test_events.py | 1 + zipline/pipeline/factors/events.py | 2 +- .../pipeline/loaders/blaze/buyback_auth.py | 12 +-- zipline/pipeline/loaders/blaze/earnings.py | 9 ++- zipline/pipeline/loaders/buyback_auth.py | 44 ++++------ zipline/pipeline/loaders/earnings.py | 24 +++--- zipline/pipeline/loaders/events.py | 58 ++++++++++--- zipline/pipeline/loaders/utils.py | 4 +- 10 files changed, 134 insertions(+), 109 deletions(-) create mode 100644 tests/pipeline/test_events.py diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 6b7218c0..7bc9d6c8 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -32,7 +32,7 @@ from zipline.pipeline.loaders.blaze import ( TS_FIELD_NAME, CASH_FIELD_NAME ) -from zipline.utils.numpy_utils import make_datetime64D, np_NaT +from zipline.utils.numpy_utils import make_datetime64D, NaTD from zipline.utils.test_utils import ( gen_calendars, make_simple_equity_info, @@ -217,7 +217,7 @@ class BuybackAuthLoaderCommonTest: # Set NaTs to 0 temporarily because busday_count doesn't support NaT. # We fill these entries with NaNs later. - whereNaT = raw_announce_dates == np_NaT + whereNaT = raw_announce_dates == NaTD raw_announce_dates[whereNaT] = make_datetime64D(0) # The abs call here makes it so that we can use this function to @@ -260,8 +260,6 @@ class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): """ Test for cash buyback authorizations dataset. 
""" - buyback_authorizations = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) - for sid, df in iteritems(buyback_authorizations)} pipeline_columns = { 'previous_buyback_cash': CashBuybackAuthorizations.previous_value.latest, @@ -278,7 +276,9 @@ class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): tmp_asset_finder(equities=equity_info), ) cls.cols = {} - cls.buyback_authorizations = buyback_authorizations + cls.buyback_authorizations = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) + for sid, df in + iteritems(buyback_authorizations)} cls.loader_type = CashBuybackAuthorizationsLoader @classmethod @@ -325,8 +325,6 @@ class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderCommonTest, TestCase): """ Test for share buyback authorizations dataset. """ - buyback_authorizations = {sid: df.drop(CASH_FIELD_NAME, 1) - for sid, df in iteritems(buyback_authorizations)} pipeline_columns = { 'previous_buyback_share_count': ShareBuybackAuthorizations.previous_share_count.latest, @@ -343,7 +341,9 @@ class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderCommonTest, TestCase): tmp_asset_finder(equities=equity_info), ) cls.cols = {} - cls.buyback_authorizations = buyback_authorizations + cls.buyback_authorizations = {sid: df.drop(CASH_FIELD_NAME, 1) + for sid, df in + iteritems(buyback_authorizations)} cls.loader_type = ShareBuybackAuthorizationsLoader @classmethod @@ -386,23 +386,6 @@ class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderCommonTest, TestCase): self._test_compute_buyback_auth(dates) -def mapping_to_df(mapping): - return (bz.Data(pd.concat( - pd.DataFrame({ - BUYBACK_ANNOUNCEMENT_FIELD_NAME: - frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME], - SHARE_COUNT_FIELD_NAME: - frame[SHARE_COUNT_FIELD_NAME], - CASH_FIELD_NAME: - frame[CASH_FIELD_NAME], - TS_FIELD_NAME: - frame[TS_FIELD_NAME], - SID_FIELD_NAME: sid, - }) - for sid, frame in iteritems(mapping) - ).reset_index(drop=True)),) - - class BlazeCashBuybackAuthLoaderTestCase(CashBuybackAuthLoaderTestCase): """ Test case 
for loading via blaze. """ @@ -416,7 +399,18 @@ class BlazeCashBuybackAuthLoaderTestCase(CashBuybackAuthLoaderTestCase): BlazeCashBuybackAuthLoaderTestCase, self, ).loader_args(dates) - return mapping_to_df(mapping) + return (bz.Data(pd.concat( + pd.DataFrame({ + BUYBACK_ANNOUNCEMENT_FIELD_NAME: + frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME], + CASH_FIELD_NAME: + frame[CASH_FIELD_NAME], + TS_FIELD_NAME: + frame[TS_FIELD_NAME], + SID_FIELD_NAME: sid, + }) + for sid, frame in iteritems(mapping) + ).reset_index(drop=True)),) class BlazeShareBuybackAuthLoaderTestCase(ShareBuybackAuthLoaderTestCase): @@ -432,7 +426,18 @@ class BlazeShareBuybackAuthLoaderTestCase(ShareBuybackAuthLoaderTestCase): BlazeShareBuybackAuthLoaderTestCase, self, ).loader_args(dates) - return mapping_to_df(mapping) + return (bz.Data(pd.concat( + pd.DataFrame({ + BUYBACK_ANNOUNCEMENT_FIELD_NAME: + frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME], + SHARE_COUNT_FIELD_NAME: + frame[SHARE_COUNT_FIELD_NAME], + TS_FIELD_NAME: + frame[TS_FIELD_NAME], + SID_FIELD_NAME: sid, + }) + for sid, frame in iteritems(mapping) + ).reset_index(drop=True)),) class BlazeShareBuybackAuthLoaderNotInteractiveTestCase( @@ -458,20 +463,24 @@ class BlazeCashBuybackAuthLoaderNotInteractiveTestCase( ).loader_args(dates) return swap_resources_into_scope(bound_expr, {}) +dtx = pd.date_range('2014-01-01', '2014-01-10') + class BuybackAuthLoaderInferTimestampTestCase(TestCase): - @parameterized.expand([[CashBuybackAuthorizationsLoader], - [ShareBuybackAuthorizationsLoader]]) - def test_infer_timestamp(self, loader): - dtx = pd.date_range('2014-01-01', '2014-01-10') + # 'fields' needs to match expected fields for the given loader to + # satisfy column check in constructor. 
+ @parameterized.expand([[CashBuybackAuthorizationsLoader, + {BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx, + CASH_FIELD_NAME: [0] * 10}], + [ShareBuybackAuthorizationsLoader, + {BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx, + SHARE_COUNT_FIELD_NAME: [0] * 10}]]) + def test_infer_timestamp(self, loader, fields): events_by_sid = { # No timestamp column - should index by first given date - 0: pd.DataFrame({BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx}), + 0: pd.DataFrame(fields), # timestamp column exists - should index by it - 1: pd.DataFrame( - {BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx, - TS_FIELD_NAME: dtx} - ) + 1: pd.DataFrame(dict(fields, **{TS_FIELD_NAME: dtx})) } loader = loader( dtx, diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index aecc2c88..308f9977 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -386,16 +386,16 @@ class EarningsCalendarLoaderInferTimestampTestCase(TestCase): announcement_dates.keys(), ) assert_series_equal( - pd.Series(loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME]), + loader.events_by_sid[0].loc[:, ANNOUNCEMENT_FIELD_NAME], pd.Series(index=[dtx[0]] * 10, data=dtx, name=ANNOUNCEMENT_FIELD_NAME), ) assert_series_equal( - pd.Series(loader.events_by_sid[1][ANNOUNCEMENT_FIELD_NAME]), - pd.Series(index=announcement_dates[1][TS_FIELD_NAME], + loader.events_by_sid[1].loc[:, ANNOUNCEMENT_FIELD_NAME], + pd.Series(index=announcement_dates[1].loc[:, TS_FIELD_NAME], data=np.array( - announcement_dates[1][ANNOUNCEMENT_FIELD_NAME] + announcement_dates[1].loc[:, ANNOUNCEMENT_FIELD_NAME] ), name=ANNOUNCEMENT_FIELD_NAME) ) diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py new file mode 100644 index 00000000..8710fd29 --- /dev/null +++ b/tests/pipeline/test_events.py @@ -0,0 +1 @@ +__author__ = 'mtydykov' diff --git a/zipline/pipeline/factors/events.py b/zipline/pipeline/factors/events.py index 127aae29..cee82d2d 100644 --- a/zipline/pipeline/factors/events.py +++ 
b/zipline/pipeline/factors/events.py @@ -42,7 +42,7 @@ class BusinessDaysSincePreviousEvents(Factor): announce_dates = arrays[0].astype(datetime64D_dtype) # Set masked values to NaT. - announce_dates[~mask] = np_NaT + announce_dates[~mask] = NaTD # Convert row labels into a column vector for broadcasted comparison. reference_dates = dates.values.astype(datetime64D_dtype)[:, newaxis] diff --git a/zipline/pipeline/loaders/blaze/buyback_auth.py b/zipline/pipeline/loaders/blaze/buyback_auth.py index e92c3b9c..c5505079 100644 --- a/zipline/pipeline/loaders/blaze/buyback_auth.py +++ b/zipline/pipeline/loaders/blaze/buyback_auth.py @@ -32,7 +32,7 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): The timezeone to use for the data query cutoff. dataset: DataSet The DataSet object for which this loader loads data. - loader: EventsLoader + concrete_loader: EventsLoader The reference loader to use for this dataset. Notes @@ -74,7 +74,7 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=None, data_query_tz=None, dataset=CashBuybackAuthorizations, - loader=CashBuybackAuthorizationsLoader, + concrete_loader=CashBuybackAuthorizationsLoader, **kwargs): super( BlazeCashBuybackAuthorizationsLoader, self @@ -84,7 +84,7 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=data_query_time, data_query_tz=data_query_tz, dataset=dataset, - loader=loader, + concrete_loader=concrete_loader, **kwargs) @@ -106,7 +106,7 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): The timezeone to use for the data query cutoff. dataset: DataSet The DataSet object for which this loader loads data. - loader: EventsLoader + concrete_loader: EventsLoader The reference loader to use for this dataset. 
Notes @@ -148,7 +148,7 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=None, data_query_tz=None, dataset=ShareBuybackAuthorizations, - loader=ShareBuybackAuthorizationsLoader, + concrete_loader=ShareBuybackAuthorizationsLoader, **kwargs): super( BlazeShareBuybackAuthorizationsLoader, self @@ -158,5 +158,5 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=data_query_time, data_query_tz=data_query_tz, dataset=dataset, - loader=loader, + concrete_loader=concrete_loader, **kwargs) diff --git a/zipline/pipeline/loaders/blaze/earnings.py b/zipline/pipeline/loaders/blaze/earnings.py index a08c2fd8..7483f951 100644 --- a/zipline/pipeline/loaders/blaze/earnings.py +++ b/zipline/pipeline/loaders/blaze/earnings.py @@ -26,7 +26,7 @@ class BlazeEarningsCalendarLoader(BlazeEventsCalendarLoader): The timezeone to use for the data query cutoff. dataset: DataSet The DataSet object for which this loader loads data. - loader: EventsLoader + concrete_loader: EventsLoader The reference loader to use for this dataset. 
Notes @@ -66,10 +66,11 @@ class BlazeEarningsCalendarLoader(BlazeEventsCalendarLoader): data_query_time=None, data_query_tz=None, dataset=EarningsCalendar, - loader=EarningsCalendarLoader, + concrete_loader=EarningsCalendarLoader, **kwargs): super( BlazeEarningsCalendarLoader, self - ).__init__(expr, dataset=dataset, loader=loader, resources=resources, - odo_kwargs=odo_kwargs, data_query_time=data_query_time, + ).__init__(expr, dataset=dataset, concrete_loader=concrete_loader, + resources=resources, odo_kwargs=odo_kwargs, + data_query_time=data_query_time, data_query_tz=data_query_tz, **kwargs) diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index 038a89c0..538919da 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -24,31 +24,25 @@ class CashBuybackAuthorizationsLoader(EventsLoader): event date, cash value)] """ + expected_cols = frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, + CASH_FIELD_NAME]) def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=CashBuybackAuthorizations): + dataset=CashBuybackAuthorizations, + expected_cols=expected_cols): super(CashBuybackAuthorizationsLoader, self).__init__( all_dates, events_by_sid, infer_timestamps=infer_timestamps, - dataset=dataset + dataset=dataset, + expected_cols=expected_cols, ) - def get_loader(self, column): - """dispatch to the loader for ``column``. - """ - if column is self.dataset.previous_value: - return self.previous_buyback_value_loader - elif column is self.dataset.previous_announcement_date: - return self.previous_event_date_loader - else: - raise ValueError("Don't know how to load column '%s'." 
% column) - @lazyval - def previous_buyback_value_loader(self): + def previous_value_loader(self): return self._previous_event_value_loader( self.dataset.previous_value, BUYBACK_ANNOUNCEMENT_FIELD_NAME, @@ -56,7 +50,7 @@ class CashBuybackAuthorizationsLoader(EventsLoader): ) @lazyval - def previous_event_date_loader(self): + def previous_announcement_date_loader(self): return self._previous_event_date_loader( self.dataset.previous_announcement_date, BUYBACK_ANNOUNCEMENT_FIELD_NAME, @@ -75,31 +69,25 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): event date, share value)] """ + expected_cols = frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME]) def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=ShareBuybackAuthorizations): + dataset=ShareBuybackAuthorizations, + expected_cols=expected_cols): super(ShareBuybackAuthorizationsLoader, self).__init__( all_dates, events_by_sid, infer_timestamps=infer_timestamps, - dataset=dataset + dataset=dataset, + expected_cols=expected_cols, ) - def get_loader(self, column): - """dispatch to the loader for ``column``. - """ - if column is self.dataset.previous_share_count: - return self.previous_buyback_share_count_loader - elif column is self.dataset.previous_announcement_date: - return self.previous_event_date_loader - else: - raise ValueError("Don't know how to load column '%s'." 
% column) - @lazyval - def previous_buyback_share_count_loader(self): + def previous_share_count_loader(self): return self._previous_event_value_loader( self.dataset.previous_share_count, BUYBACK_ANNOUNCEMENT_FIELD_NAME, @@ -107,7 +95,7 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): ) @lazyval - def previous_event_date_loader(self): + def previous_announcement_date_loader(self): return self._previous_event_date_loader( self.dataset.previous_announcement_date, BUYBACK_ANNOUNCEMENT_FIELD_NAME, diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index e2030430..33c421bb 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -10,22 +10,16 @@ ANNOUNCEMENT_FIELD_NAME = "announcement_date" class EarningsCalendarLoader(EventsLoader): - def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=EarningsCalendar): - super(EarningsCalendarLoader, self).__init__(all_dates, - events_by_sid, - infer_timestamps, - dataset=dataset) + expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME]) - def get_loader(self, column): - """Dispatch to the loader for ``column``. - """ - if column is self.dataset.next_announcement: - return self.next_announcement_loader - elif column is self.dataset.previous_announcement: - return self.previous_announcement_loader - else: - raise ValueError("Don't know how to load column '%s'." 
% column) + def __init__(self, all_dates, events_by_sid, + infer_timestamps=False, + dataset=EarningsCalendar, + expected_cols=expected_cols): + super(EarningsCalendarLoader, self).__init__( + all_dates, events_by_sid, infer_timestamps, dataset=dataset, + expected_cols=expected_cols + ) @lazyval def next_announcement_loader(self): diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 9920a729..9165581c 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -10,6 +10,7 @@ from .frame import DataFrameLoader from .utils import next_date_frame, previous_date_frame, previous_value TS_FIELD_NAME = "timestamp" +SID_FIELD_NAME = "sid" class EventsLoader(PipelineLoader): @@ -32,18 +33,21 @@ class EventsLoader(PipelineLoader): If the DataFrames do not contain a "timestamp" column, we assume we knew about the event on all prior dates. This mode is only supported if ``infer_timestamp`` is explicitly passed as a truthy value. - infer_timestamps : bool, optional Whether to allow omitting the "timestamp" column. + dataset : DataSet + The DataSet object for which this loader loads data. + expected_cols : frozenset + Set of expected columns for the dataset, without timestamp. """ def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=None): + dataset=None, + expected_cols=frozenset()): self.all_dates = all_dates - # Do not modify the original in place, since it may be used for other # purposes. self.events_by_sid = ( @@ -52,24 +56,52 @@ class EventsLoader(PipelineLoader): dates = self.all_dates.values for k, v in iteritems(events_by_sid): - if "timestamp" not in v.columns: + # First, must convert to DataFrame. + if isinstance(v, pd.Series): + # If Series was passed, DateTime index is assumed. 
+ self.events_by_sid[k] = pd.DataFrame(v) + elif isinstance(v, pd.DatetimeIndex): if not infer_timestamps: raise ValueError( - "Got DataFrame without a 'timestamp' column for " - "sid %d.\n" + "Got DatetimeIndex for sid %d.\n" "Pass `infer_timestamps=True` to use the first date in" - " `all_dates` as implicit timestamp." + " `all_dates` as implicit timestamp."% k ) - self.events_by_sid[k] = v = v.copy() + self.events_by_sid[k] = pd.DataFrame(v) v.index = [dates[0]] * len(v) + # Already a DataFrame + elif isinstance(v, pd.DataFrame): + if TS_FIELD_NAME not in v.columns: + if not infer_timestamps: + raise ValueError( + "Got DataFrame without a '%s' column for sid %d.\n" + "Pass `infer_timestamps=True` to use the first " + "date in `all_dates` as implicit timestamp."% + (TS_FIELD_NAME, k) + ) + self.events_by_sid[k] = v = v.copy() + v.index = [dates[0]] * len(v) + else: + self.events_by_sid[k] = v.set_index(TS_FIELD_NAME) else: - self.events_by_sid[k] = v.set_index("timestamp") - + raise ValueError("Data for sid %s must be in DataFrame, " + "Series, or DatetimeIndex."% k) + # Once data is in a DF, make sure columns are correct. + cols_except_ts = (set(v.columns.values) - + {TS_FIELD_NAME} - + {SID_FIELD_NAME}) + # Check that all columns other than timestamp are as expected. + if cols_except_ts != expected_cols: + raise ValueError( + "Expected columns %s for sid %s but got columns %s." % + (expected_cols, k, v.columns.values) + ) self.dataset = dataset - @abstractmethod - def get_loader(self): - raise NotImplementedError("Must implement 'get_loader'.") + def get_loader(self, column): + if column in self.dataset.columns: + return getattr(self, "%s_loader" % column.name) + raise ValueError("Don't know how to load column '%s'." 
% column) def load_adjusted_array(self, columns, dates, assets, mask): return merge( diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index ff4ee899..5be70420 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -6,7 +6,7 @@ import pandas as pd from six import iteritems from six.moves import zip -from zipline.utils.numpy_utils import NaTns +from zipline.utils.numpy_utils import NaTns, NaTD def next_date_frame(dates, events_by_sid): @@ -83,7 +83,7 @@ def previous_date_frame(date_index, events_by_sid): next_date_frame """ sids = list(events_by_sid) - out = np.full((len(date_index), len(sids)), np_NaT, dtype='datetime64[ns]') + out = np.full((len(date_index), len(sids)), NaTD, dtype='datetime64[ns]') d_n = date_index[-1].asm8 for col_idx, sid in enumerate(sids): # events_by_sid[sid] is Series mapping knowledge_date to actual From 7ddf18b407ddcef6c9e1bd7d2983cb98352e1251 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 16 Feb 2016 18:19:32 -0500 Subject: [PATCH 06/12] MAINT: use column's missing value. STY: fix flake8 failures. MAINT: fixes for compatibility with py2. MAINT: fix import error. MAINT: use dict.items() for compatibility with py3. 
--- tests/pipeline/test_buyback_auth.py | 6 +++--- tests/pipeline/test_earnings.py | 12 +++++++----- zipline/pipeline/loaders/blaze/events.py | 2 +- zipline/pipeline/loaders/buyback_auth.py | 2 +- zipline/pipeline/loaders/earnings.py | 2 +- zipline/pipeline/loaders/events.py | 12 ++++-------- zipline/pipeline/loaders/utils.py | 8 +------- 7 files changed, 18 insertions(+), 26 deletions(-) diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 7bc9d6c8..5d77aec1 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -119,7 +119,7 @@ def zip_with_dates(index_dates, dts): return pd.Series(pd.to_datetime(dts), index=index_dates) -class BuybackAuthLoaderCommonTest: +class BuybackAuthLoaderCommonTest(object): """ Tests for loading the buyback authorization announcement data. """ @@ -294,8 +294,8 @@ class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): # until 15? A: zip_with_floats_dates( ['NaN'] * num_days_between(dates, None, '2014-01-14') + - [10] * num_days_between_dates('2014-01-15', '2014-01-19') + - [20] * num_days_between_dates('2014-01-20', None) + [10] * num_days_between_dates('2014-01-15', '2014-01-19') + + [20] * num_days_between_dates('2014-01-20', None) ), B: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-14') + diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index 308f9977..f48251da 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -393,9 +393,11 @@ class EarningsCalendarLoaderInferTimestampTestCase(TestCase): ) assert_series_equal( loader.events_by_sid[1].loc[:, ANNOUNCEMENT_FIELD_NAME], - pd.Series(index=announcement_dates[1].loc[:, TS_FIELD_NAME], - data=np.array( - announcement_dates[1].loc[:, ANNOUNCEMENT_FIELD_NAME] - ), - name=ANNOUNCEMENT_FIELD_NAME) + pd.Series( + index=announcement_dates[1].loc[:, TS_FIELD_NAME], + data=np.array( + announcement_dates[1].loc[:, 
ANNOUNCEMENT_FIELD_NAME] + ), + name=ANNOUNCEMENT_FIELD_NAME + ) ) diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 8196377c..6dc0e6da 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -119,4 +119,4 @@ class BlazeEventsCalendarLoader(PipelineLoader): ).load_adjusted_array(columns, dates, assets, mask) def prepare_data(self, raw, gb): - return {sid: raw.loc[group] for sid, group in gb.groups.iteritems()} + return {sid: raw.loc[group] for sid, group in gb.groups.items()} diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index 538919da..d1661bd1 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -6,7 +6,7 @@ from ..data.buyback_auth import ( CashBuybackAuthorizations, ShareBuybackAuthorizations ) -from events import EventsLoader +from .events import EventsLoader from zipline.utils.memoize import lazyval diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index 33c421bb..153c5c25 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -3,7 +3,7 @@ Reference implementation for EarningsCalendar loaders. 
""" from ..data.earnings import EarningsCalendar -from events import EventsLoader +from .events import EventsLoader from zipline.utils.memoize import lazyval ANNOUNCEMENT_FIELD_NAME = "announcement_date" diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 9165581c..1cf4f6f4 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -1,5 +1,3 @@ -from abc import abstractmethod - import numpy as np import pandas as pd from six import iteritems @@ -65,7 +63,7 @@ class EventsLoader(PipelineLoader): raise ValueError( "Got DatetimeIndex for sid %d.\n" "Pass `infer_timestamps=True` to use the first date in" - " `all_dates` as implicit timestamp."% k + " `all_dates` as implicit timestamp." % k ) self.events_by_sid[k] = pd.DataFrame(v) v.index = [dates[0]] * len(v) @@ -76,7 +74,7 @@ class EventsLoader(PipelineLoader): raise ValueError( "Got DataFrame without a '%s' column for sid %d.\n" "Pass `infer_timestamps=True` to use the first " - "date in `all_dates` as implicit timestamp."% + "date in `all_dates` as implicit timestamp." % (TS_FIELD_NAME, k) ) self.events_by_sid[k] = v = v.copy() @@ -85,7 +83,7 @@ class EventsLoader(PipelineLoader): self.events_by_sid[k] = v.set_index(TS_FIELD_NAME) else: raise ValueError("Data for sid %s must be in DataFrame, " - "Series, or DatetimeIndex."% k) + "Series, or DatetimeIndex." % k) # Once data is in a DF, make sure columns are correct. 
cols_except_ts = (set(v.columns.values) - {TS_FIELD_NAME} - @@ -150,9 +148,7 @@ class EventsLoader(PipelineLoader): event_date_field_name, value_field_name, previous_value_field.dtype, - # TODO: need to get actual name/method to use to get missing - # value - None + previous_value_field.missing_value ), adjustments=None, ) diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 5be70420..1dd685dc 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -1,7 +1,6 @@ import datetime import numpy as np -from numpy import NaN import pandas as pd from six import iteritems from six.moves import zip @@ -125,14 +124,9 @@ def previous_value(date_index, events_by_sid, event_date_field, value_field, next_date_frame """ sids = list(events_by_sid) - # TODO: generalize; need to use dtype of column and missing value for that - # column; so pass - # in the pipeline column's attributes for these (replace NaN and dtype - # below) out = np.full( (len(date_index), len(sids)), - # TODO; replace with missing_value - NaN, + missing_value, dtype=value_field_dtype ) d_n = date_index[-1].asm8 From 8ceff892c595e3c76969e6c41fe2353cb7bb3f71 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Thu, 18 Feb 2016 15:54:35 -0500 Subject: [PATCH 07/12] BUG: specify UTC and no name-checking. DOC: fix doc. MAINT: revert to using getitem. --- tests/pipeline/test_buyback_auth.py | 6 +++--- tests/pipeline/test_earnings.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 5d77aec1..6c023e9c 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -1,5 +1,5 @@ """ -Tests for the reference loader for EarningsCalendar. +Tests for the reference loader for Buyback Authorizations. 
""" from functools import partial from unittest import TestCase @@ -103,7 +103,7 @@ param_dates = list(gen_calendars( '2014-01-10', '2014-01-15', '2014-01-20', - ]), + ], utc=True), )) @@ -253,7 +253,7 @@ class BuybackAuthLoaderCommonTest(object): for col_name in self.cols.keys(): assert_series_equal(result[col_name].xs(sid, level=1), self.cols[col_name][sid], - sid) + check_names=False) class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index f48251da..185801ba 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -392,11 +392,11 @@ class EarningsCalendarLoaderInferTimestampTestCase(TestCase): name=ANNOUNCEMENT_FIELD_NAME), ) assert_series_equal( - loader.events_by_sid[1].loc[:, ANNOUNCEMENT_FIELD_NAME], + loader.events_by_sid[1][ANNOUNCEMENT_FIELD_NAME], pd.Series( - index=announcement_dates[1].loc[:, TS_FIELD_NAME], + index=announcement_dates[1][TS_FIELD_NAME], data=np.array( - announcement_dates[1].loc[:, ANNOUNCEMENT_FIELD_NAME] + announcement_dates[1][ANNOUNCEMENT_FIELD_NAME] ), name=ANNOUNCEMENT_FIELD_NAME ) From 2fe126135cf22308cd61a747614d9086593ee47a Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Fri, 19 Feb 2016 11:45:50 -0500 Subject: [PATCH 08/12] MAINT: make concrete_loader a property. concrete_loader should be the same for each instance of a Blaze loader. MAINT: make expected_cols a property. expected_cols should be the same for each instance of a type of loader. 
--- .../pipeline/loaders/blaze/buyback_auth.py | 13 +++++++--- zipline/pipeline/loaders/blaze/earnings.py | 7 +++-- zipline/pipeline/loaders/blaze/events.py | 14 +++++----- zipline/pipeline/loaders/buyback_auth.py | 26 ++++++++++--------- zipline/pipeline/loaders/earnings.py | 9 ++++--- 5 files changed, 41 insertions(+), 28 deletions(-) diff --git a/zipline/pipeline/loaders/blaze/buyback_auth.py b/zipline/pipeline/loaders/blaze/buyback_auth.py index c5505079..42e00dd7 100644 --- a/zipline/pipeline/loaders/blaze/buyback_auth.py +++ b/zipline/pipeline/loaders/blaze/buyback_auth.py @@ -74,7 +74,6 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=None, data_query_tz=None, dataset=CashBuybackAuthorizations, - concrete_loader=CashBuybackAuthorizationsLoader, **kwargs): super( BlazeCashBuybackAuthorizationsLoader, self @@ -84,9 +83,12 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=data_query_time, data_query_tz=data_query_tz, dataset=dataset, - concrete_loader=concrete_loader, **kwargs) + @property + def concrete_loader(self): + return CashBuybackAuthorizationsLoader + class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): """A pipeline loader for the ``ShareBuybackAuthorizations`` dataset that loads @@ -141,6 +143,7 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): SHARE_COUNT_FIELD_NAME, }) + def __init__(self, expr, resources=None, @@ -148,7 +151,6 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=None, data_query_tz=None, dataset=ShareBuybackAuthorizations, - concrete_loader=ShareBuybackAuthorizationsLoader, **kwargs): super( BlazeShareBuybackAuthorizationsLoader, self @@ -158,5 +160,8 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): data_query_time=data_query_time, data_query_tz=data_query_tz, dataset=dataset, - concrete_loader=concrete_loader, **kwargs) + + @property + def 
concrete_loader(self): + return ShareBuybackAuthorizationsLoader \ No newline at end of file diff --git a/zipline/pipeline/loaders/blaze/earnings.py b/zipline/pipeline/loaders/blaze/earnings.py index 7483f951..8588ff03 100644 --- a/zipline/pipeline/loaders/blaze/earnings.py +++ b/zipline/pipeline/loaders/blaze/earnings.py @@ -66,11 +66,14 @@ class BlazeEarningsCalendarLoader(BlazeEventsCalendarLoader): data_query_time=None, data_query_tz=None, dataset=EarningsCalendar, - concrete_loader=EarningsCalendarLoader, **kwargs): super( BlazeEarningsCalendarLoader, self - ).__init__(expr, dataset=dataset, concrete_loader=concrete_loader, + ).__init__(expr, dataset=dataset, resources=resources, odo_kwargs=odo_kwargs, data_query_time=data_query_time, data_query_tz=data_query_tz, **kwargs) + + @property + def concrete_loader(self): + return EarningsCalendarLoader \ No newline at end of file diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 6dc0e6da..3da16744 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -1,3 +1,5 @@ +import abc + from datashape import istabular from .core import ( @@ -34,8 +36,6 @@ class BlazeEventsCalendarLoader(PipelineLoader): The timezeone to use for the data query cutoff. dataset : DataSet The DataSet object for which this loader loads data. - concrete_loader : - The concrete loader to use for loading data into specified columns. 
Notes ----- The expression should have a tabular dshape of:: @@ -61,8 +61,7 @@ class BlazeEventsCalendarLoader(PipelineLoader): odo_kwargs=None, data_query_time=None, data_query_tz=None, - dataset=None, - concrete_loader=None): + dataset=None): dshape = expr.dshape if not istabular(dshape): @@ -80,7 +79,10 @@ class BlazeEventsCalendarLoader(PipelineLoader): check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz - self._concrete_loader = concrete_loader + + @abc.abstractproperty + def concrete_loader(self): + raise NotImplementedError("Must specify `concrete_loader`.") def load_adjusted_array(self, columns, dates, assets, mask): data_query_time = self._data_query_time @@ -112,7 +114,7 @@ class BlazeEventsCalendarLoader(PipelineLoader): ts_field=TS_FIELD_NAME, ) gb = raw.groupby(SID_FIELD_NAME) - return self._concrete_loader( + return self.concrete_loader( dates, self.prepare_data(raw, gb), dataset=self._dataset, diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index d1661bd1..ba207697 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -10,8 +10,8 @@ from .events import EventsLoader from zipline.utils.memoize import lazyval -BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_dates' -SHARE_COUNT_FIELD_NAME = 'share_counts' +BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_date' +SHARE_COUNT_FIELD_NAME = 'share_count' CASH_FIELD_NAME = 'cash' @@ -24,23 +24,23 @@ class CashBuybackAuthorizationsLoader(EventsLoader): event date, cash value)] """ - expected_cols = frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, - CASH_FIELD_NAME]) def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=CashBuybackAuthorizations, - expected_cols=expected_cols): + dataset=CashBuybackAuthorizations): super(CashBuybackAuthorizationsLoader, self).__init__( all_dates, events_by_sid, infer_timestamps=infer_timestamps, 
dataset=dataset, - expected_cols=expected_cols, ) + @property + def expected_cols(self): + return frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, CASH_FIELD_NAME]) + @lazyval def previous_value_loader(self): return self._previous_event_value_loader( @@ -69,23 +69,25 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): event date, share value)] """ - expected_cols = frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, - SHARE_COUNT_FIELD_NAME]) def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=ShareBuybackAuthorizations, - expected_cols=expected_cols): + dataset=ShareBuybackAuthorizations): super(ShareBuybackAuthorizationsLoader, self).__init__( all_dates, events_by_sid, infer_timestamps=infer_timestamps, dataset=dataset, - expected_cols=expected_cols, ) + @property + def expected_cols(self): + return frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME]) + + @lazyval def previous_share_count_loader(self): return self._previous_event_value_loader( diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index 153c5c25..74d888f2 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -10,17 +10,18 @@ ANNOUNCEMENT_FIELD_NAME = "announcement_date" class EarningsCalendarLoader(EventsLoader): - expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME]) def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=EarningsCalendar, - expected_cols=expected_cols): + dataset=EarningsCalendar): super(EarningsCalendarLoader, self).__init__( all_dates, events_by_sid, infer_timestamps, dataset=dataset, - expected_cols=expected_cols ) + @property + def expected_cols(self): + return frozenset([ANNOUNCEMENT_FIELD_NAME]) + @lazyval def next_announcement_loader(self): return self._next_event_date_loader(self.dataset.next_announcement, From ae922bf3eeb7aae237e8ea54e1bc4f7999d41a4d Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Fri, 19 Feb 2016 11:48:54 -0500 
Subject: [PATCH 09/12] MAINT: modify next_date_frame and prev_date_frame to mirror previous_value. MAINT: clean up and improve docs. BUG: fix imports. MAINT: refactor test. MAINT: change class name. MAINT: remove error since won't be reached. TST: improve and expand tests. MAINT: change class name. MAINT: change class name. MAINT: extract string constants and remove error that won't be reached. STY: fix line length. MAINT: undo name change. --- tests/pipeline/test_buyback_auth.py | 47 ----- tests/pipeline/test_earnings.py | 36 ---- tests/pipeline/test_events.py | 192 +++++++++++++++++- .../pipeline/loaders/blaze/buyback_auth.py | 9 +- zipline/pipeline/loaders/blaze/earnings.py | 6 +- zipline/pipeline/loaders/blaze/events.py | 4 +- zipline/pipeline/loaders/buyback_auth.py | 1 - zipline/pipeline/loaders/events.py | 97 +++++---- zipline/pipeline/loaders/utils.py | 28 ++- 9 files changed, 280 insertions(+), 140 deletions(-) diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 6c023e9c..26fff9a3 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -462,50 +462,3 @@ class BlazeCashBuybackAuthLoaderNotInteractiveTestCase( self, ).loader_args(dates) return swap_resources_into_scope(bound_expr, {}) - -dtx = pd.date_range('2014-01-01', '2014-01-10') - - -class BuybackAuthLoaderInferTimestampTestCase(TestCase): - # 'fields' needs to match expected fields for the given loader to - # satisfy column check in constructor. 
- @parameterized.expand([[CashBuybackAuthorizationsLoader, - {BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx, - CASH_FIELD_NAME: [0] * 10}], - [ShareBuybackAuthorizationsLoader, - {BUYBACK_ANNOUNCEMENT_FIELD_NAME: dtx, - SHARE_COUNT_FIELD_NAME: [0] * 10}]]) - def test_infer_timestamp(self, loader, fields): - events_by_sid = { - # No timestamp column - should index by first given date - 0: pd.DataFrame(fields), - # timestamp column exists - should index by it - 1: pd.DataFrame(dict(fields, **{TS_FIELD_NAME: dtx})) - } - loader = loader( - dtx, - events_by_sid, - infer_timestamps=True, - ) - self.assertEqual( - loader.events_by_sid.keys(), - events_by_sid.keys(), - ) - - # Check that index by first given date has been added - assert_series_equal( - loader.events_by_sid[0][BUYBACK_ANNOUNCEMENT_FIELD_NAME], - pd.Series(index=[dtx[0]] * 10, - data=dtx, - name=BUYBACK_ANNOUNCEMENT_FIELD_NAME), - ) - - # Check that timestamp column was turned into index - modified_events_by_sid_date_col = pd.Series(data=np.array( - events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME]), - index=events_by_sid[1][TS_FIELD_NAME], - name=BUYBACK_ANNOUNCEMENT_FIELD_NAME) - assert_series_equal( - loader.events_by_sid[1][BUYBACK_ANNOUNCEMENT_FIELD_NAME], - modified_events_by_sid_date_col, - ) diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index 185801ba..7f9986b8 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -365,39 +365,3 @@ class BlazeEarningsCalendarLoaderNotInteractiveTestCase( self, ).loader_args(dates) return swap_resources_into_scope(bound_expr, {}) - - -class EarningsCalendarLoaderInferTimestampTestCase(TestCase): - def test_infer_timestamp(self): - dtx = pd.date_range('2014-01-01', '2014-01-10') - announcement_dates = { - 0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), - 1: pd.DataFrame( - {TS_FIELD_NAME: dtx, ANNOUNCEMENT_FIELD_NAME: dtx} - ), - } - loader = EarningsCalendarLoader( - dtx, - announcement_dates, - 
infer_timestamps=True, - ) - self.assertEqual( - loader.events_by_sid.keys(), - announcement_dates.keys(), - ) - assert_series_equal( - loader.events_by_sid[0].loc[:, ANNOUNCEMENT_FIELD_NAME], - pd.Series(index=[dtx[0]] * 10, - data=dtx, - name=ANNOUNCEMENT_FIELD_NAME), - ) - assert_series_equal( - loader.events_by_sid[1][ANNOUNCEMENT_FIELD_NAME], - pd.Series( - index=announcement_dates[1][TS_FIELD_NAME], - data=np.array( - announcement_dates[1][ANNOUNCEMENT_FIELD_NAME] - ), - name=ANNOUNCEMENT_FIELD_NAME - ) - ) diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 8710fd29..62df1ea3 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -1 +1,191 @@ -__author__ = 'mtydykov' +""" +Tests for setting up an EventsLoader and a BlazeEventsLoader. +""" +from nose_parameterized import parameterized + +import blaze as bz +import pandas as pd +from pandas.util.testing import assert_series_equal, TestCase, assertRaises + +from zipline.pipeline.data import DataSet, Column +from zipline.pipeline.loaders.blaze.events import BlazeEventsLoader +from zipline.pipeline.loaders.events import ( + BAD_DATA_FORMAT_ERROR, + DF_NO_TS_NOT_INFER_TS_ERROR, + DTINDEX_NOT_INFER_TS_ERROR, + EventsLoader, + SERIES_NO_DTINDEX_ERROR, + SID_FIELD_NAME, + TS_FIELD_NAME, + WRONG_COLS_ERROR, +) +from zipline.utils.memoize import lazyval +from zipline.utils.numpy_utils import datetime64ns_dtype + +ABSTRACT_METHODS_ERROR = 'abstract methods concrete_loader' + +DAYS_SINCE_PREV = 'days_since_prev' + +PREVIOUS_ANNOUNCEMENT = 'previous_announcement' + +ANNOUNCEMENT_FIELD_NAME = 'announcement_date' + + +class EventDataSet(DataSet): + previous_announcement = Column(datetime64ns_dtype) + + +class EventDataSetLoader(EventsLoader): + + def __init__(self, + all_dates, + events_by_sid, + infer_timestamps=False, + dataset=EventDataSet): + super(EventDataSetLoader, self).__init__( + all_dates, + events_by_sid, + infer_timestamps=infer_timestamps, + 
dataset=dataset, + ) + + @property + def expected_cols(self): + return frozenset([ANNOUNCEMENT_FIELD_NAME]) + + @lazyval + def previous_announcement_loader(self): + return self._previous_event_date_loader( + self.dataset.previous_announcement, + ANNOUNCEMENT_FIELD_NAME, + ) + + @lazyval + def next_announcement_loader(self): + return self._previous_event_date_loader( + self.dataset.previous_announcement, + ANNOUNCEMENT_FIELD_NAME, + ) + + +class EventDataSetLoaderNoExpectedCols(EventsLoader): + + def __init__(self, + all_dates, + events_by_sid, + infer_timestamps=False, + dataset=EventDataSet): + super(EventDataSetLoaderNoExpectedCols, self).__init__( + all_dates, + events_by_sid, + infer_timestamps=infer_timestamps, + dataset=dataset, + ) + + +dtx = pd.date_range('2014-01-01', '2014-01-10') + + +def assert_loader_error(events_by_sid, error, msg, infer_timestamps=True): + with assertRaises(error) as context: + EventDataSetLoader( + dtx, events_by_sid, infer_timestamps=infer_timestamps, + ) + assert msg in context.exception + + +class EventLoaderTestCase(TestCase): + + def test_no_expected_cols_defined(self): + events_by_sid = {0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx})} + assert_loader_error(events_by_sid, TypeError, ABSTRACT_METHODS_ERROR) + + def test_wrong_cols(self): + wrong_col_name = 'some_other_col' + # Test wrong cols (cols != expected) + events_by_sid = {0: pd.DataFrame({wrong_col_name: dtx})} + assert_loader_error( + events_by_sid, ValueError, WRONG_COLS_ERROR % ( + EventDataSetLoader.expected_cols, 0, wrong_col_name + ) + ) + + @parameterized.expand([ + # DataFrame without timestamp column and infer_timestamps = True + [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), True], + # DataFrame with timestamp column + [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx, + TS_FIELD_NAME: dtx}), False], + # DatetimeIndex with infer_timestamps = True + [pd.DatetimeIndex(dtx, name=ANNOUNCEMENT_FIELD_NAME), True], + # Series with DatetimeIndex as index and 
infer_timestamps = False + [pd.Series(dtx, index=dtx, name=ANNOUNCEMENT_FIELD_NAME), False] + ]) + def test_conversion_to_df(self, df, infer_timestamps): + + events_by_sid = {0: df} + loader = EventDataSetLoader( + dtx, + events_by_sid, + infer_timestamps=infer_timestamps, + ) + self.assertEqual( + loader.events_by_sid.keys(), + events_by_sid.keys(), + ) + + if infer_timestamps: + expected = pd.Series(index=[dtx[0]] * 10, data=dtx, ) + else: + expected = pd.Series(index=dtx, data=dtx,) + # Check that index by first given date has been added + assert_series_equal( + loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME], + expected, + check_names=False + ) + + @parameterized.expand([ + # DataFrame without timestamp column and infer_timestamps = False + [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), False, + DF_NO_TS_NOT_INFER_TS_ERROR % (TS_FIELD_NAME, 0)], + # DatetimeIndex with infer_timestamps = False + [pd.DatetimeIndex(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, + DTINDEX_NOT_INFER_TS_ERROR % 0], + # Series without DatetimeIndex as index and infer_timestamps = False + [pd.Series(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, + SERIES_NO_DTINDEX_ERROR % 0], + # Some other data structure that is not expected + [dtx, False, BAD_DATA_FORMAT_ERROR % 0], + [dtx, True, BAD_DATA_FORMAT_ERROR % 0] + ]) + def test_bad_conversion_to_df(self, df, infer_timestamps, msg): + events_by_sid = {0: df} + assert_loader_error(events_by_sid, ValueError, msg, + infer_timestamps=infer_timestamps) + + +class BlazeEventDataSetLoaderNoConcreteLoader(BlazeEventsLoader): + def __init__(self, + expr, + dataset=EventDataSet, + **kwargs): + super( + BlazeEventDataSetLoaderNoConcreteLoader, self + ).__init__(expr, + dataset=dataset, + **kwargs) + + +class BlazeEventLoaderTestCase(TestCase): + # Blaze loader: need to test failure if no concrete loader + def test_no_concrete_loader_defined(self): + with assertRaises(TypeError) as context: + BlazeEventDataSetLoaderNoConcreteLoader( + bz.Data( + 
pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx, + SID_FIELD_NAME: 0 + }) + ) + ) + assert ABSTRACT_METHODS_ERROR in context.exception diff --git a/zipline/pipeline/loaders/blaze/buyback_auth.py b/zipline/pipeline/loaders/blaze/buyback_auth.py index 42e00dd7..20dbfcda 100644 --- a/zipline/pipeline/loaders/blaze/buyback_auth.py +++ b/zipline/pipeline/loaders/blaze/buyback_auth.py @@ -11,10 +11,10 @@ from zipline.pipeline.loaders.buyback_auth import ( ShareBuybackAuthorizationsLoader, SHARE_COUNT_FIELD_NAME ) -from .events import BlazeEventsCalendarLoader +from .events import BlazeEventsLoader -class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): +class BlazeCashBuybackAuthorizationsLoader(BlazeEventsLoader): """A pipeline loader for the ``CashBuybackAuthorizations`` dataset that loads data from a blaze expression. @@ -90,7 +90,7 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): return CashBuybackAuthorizationsLoader -class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): +class BlazeShareBuybackAuthorizationsLoader(BlazeEventsLoader): """A pipeline loader for the ``ShareBuybackAuthorizations`` dataset that loads data from a blaze expression. 
@@ -143,7 +143,6 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): SHARE_COUNT_FIELD_NAME, }) - def __init__(self, expr, resources=None, @@ -164,4 +163,4 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsCalendarLoader): @property def concrete_loader(self): - return ShareBuybackAuthorizationsLoader \ No newline at end of file + return ShareBuybackAuthorizationsLoader diff --git a/zipline/pipeline/loaders/blaze/earnings.py b/zipline/pipeline/loaders/blaze/earnings.py index 8588ff03..06d16738 100644 --- a/zipline/pipeline/loaders/blaze/earnings.py +++ b/zipline/pipeline/loaders/blaze/earnings.py @@ -5,10 +5,10 @@ from .core import ( ) from zipline.pipeline.data import EarningsCalendar from zipline.pipeline.loaders.earnings import EarningsCalendarLoader -from .events import BlazeEventsCalendarLoader +from .events import BlazeEventsLoader -class BlazeEarningsCalendarLoader(BlazeEventsCalendarLoader): +class BlazeEarningsCalendarLoader(BlazeEventsLoader): """A pipeline loader for the ``EarningsCalendar`` dataset that loads data from a blaze expression. @@ -76,4 +76,4 @@ class BlazeEarningsCalendarLoader(BlazeEventsCalendarLoader): @property def concrete_loader(self): - return EarningsCalendarLoader \ No newline at end of file + return EarningsCalendarLoader diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 3da16744..70850251 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -18,7 +18,7 @@ from zipline.utils.input_validation import ensure_timezone, optionally from zipline.utils.preprocess import preprocess -class BlazeEventsCalendarLoader(PipelineLoader): +class BlazeEventsLoader(PipelineLoader): """An abstract pipeline loader for the events datasets that loads data from a blaze expression. 
@@ -82,7 +82,7 @@ class BlazeEventsCalendarLoader(PipelineLoader): @abc.abstractproperty def concrete_loader(self): - raise NotImplementedError("Must specify `concrete_loader`.") + pass def load_adjusted_array(self, columns, dates, assets, mask): data_query_time = self._data_query_time diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index ba207697..eb14689e 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -87,7 +87,6 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): return frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, SHARE_COUNT_FIELD_NAME]) - @lazyval def previous_share_count_loader(self): return self._previous_event_value_loader( diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 1cf4f6f4..9c18611a 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -1,4 +1,4 @@ -import numpy as np +import abc import pandas as pd from six import iteritems from toolz import merge @@ -7,6 +7,23 @@ from .base import PipelineLoader from .frame import DataFrameLoader from .utils import next_date_frame, previous_date_frame, previous_value +WRONG_COLS_ERROR = "Expected columns %s for sid %s but got columns %s." 
+ +BAD_DATA_FORMAT_ERROR = ("Data for sid %s must be in DataFrame, " + "Series, or DatetimeIndex.") + +SERIES_NO_DTINDEX_ERROR = ("Got Series for sid %d, but index was not " + "DatetimeIndex.") + +DTINDEX_NOT_INFER_TS_ERROR = ("Got DatetimeIndex for sid %d.\n" + "Pass `infer_timestamps=True` to use the first " + "date in `all_dates` as implicit timestamp.") + +DF_NO_TS_NOT_INFER_TS_ERROR = ("Got DataFrame without a '%r' column for sid " + "%d.\nPass `infer_timestamps=True` to use the " + "first date in `all_dates` as implicit " + "timestamp.") + TS_FIELD_NAME = "timestamp" SID_FIELD_NAME = "sid" @@ -21,16 +38,29 @@ class EventsLoader(PipelineLoader): ---------- all_dates : pd.DatetimeIndex Index of dates for which we can serve queries. - events_by_sid : dict[int -> pd.DataFrame] - Dict mapping sids to DataFrames representing dates on which events - occurred along with other associated values. + events_by_sid : dict[int -> pd.DataFrame], dict[int -> pd.Series], + or dict[int -> pd.DatetimeIndex] + Dict mapping sids to objects representing dates on which events + occurred. - If the DataFrames contain a "timestamp" column, that column is - interpreted as the date on which we learned about the event. + If a dict value is a Series, it's interpreted as a mapping from the + date on which we learned an announcement was coming to the date on + which the announcement was made. + + If a dict value is a DatetimeIndex, it's interpreted as just containing + the dates that announcements were made, and we assume we knew about the + announcement on all prior dates. This mode is only supported if + ``infer_timestamps`` is explicitly passed as a truthy value. + Dict mapping sids to DataFrames, Series, or DatetimeIndexes. + + If the value is a DataFrame, it then represents dates on which events + occurred along with other associated values. If the DataFrame + contains a "timestamp" column, that column is interpreted as the date + on which we learned about the event. 
If the DataFrames do not contain a + "timestamp" column, we assume we knew about the event on all prior + dates. This mode is only supported if ``infer_timestamps`` is + explicitly passed as a truthy value. - If the DataFrames do not contain a "timestamp" column, we assume we - knew about the event on all prior dates. This mode is only supported - if ``infer_timestamp`` is explicitly passed as a truthy value. infer_timestamps : bool, optional Whether to allow omitting the "timestamp" column. dataset : DataSet @@ -39,12 +69,15 @@ class EventsLoader(PipelineLoader): Set of expected columns for the dataset, without timestamp. """ + @abc.abstractproperty + def expected_cols(self): + pass + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, - dataset=None, - expected_cols=frozenset()): + dataset=None): self.all_dates = all_dates # Do not modify the original in place, since it may be used for other # purposes. @@ -56,25 +89,25 @@ class EventsLoader(PipelineLoader): for k, v in iteritems(events_by_sid): # First, must convert to DataFrame. if isinstance(v, pd.Series): - # If Series was passed, DateTime index is assumed. - self.events_by_sid[k] = pd.DataFrame(v) + if not isinstance(v.index, pd.DatetimeIndex): + raise ValueError( + SERIES_NO_DTINDEX_ERROR % k + ) + self.events_by_sid[k] = v = pd.DataFrame(v) elif isinstance(v, pd.DatetimeIndex): if not infer_timestamps: raise ValueError( - "Got DatetimeIndex for sid %d.\n" - "Pass `infer_timestamps=True` to use the first date in" - " `all_dates` as implicit timestamp." 
% k + DTINDEX_NOT_INFER_TS_ERROR % k ) - self.events_by_sid[k] = pd.DataFrame(v) - v.index = [dates[0]] * len(v) + self.events_by_sid[k] = v = pd.DataFrame( + v, index=[dates[0]] * len(v) + ) # Already a DataFrame elif isinstance(v, pd.DataFrame): if TS_FIELD_NAME not in v.columns: if not infer_timestamps: raise ValueError( - "Got DataFrame without a '%s' column for sid %d.\n" - "Pass `infer_timestamps=True` to use the first " - "date in `all_dates` as implicit timestamp." % + DF_NO_TS_NOT_INFER_TS_ERROR % (TS_FIELD_NAME, k) ) self.events_by_sid[k] = v = v.copy() @@ -82,17 +115,16 @@ class EventsLoader(PipelineLoader): else: self.events_by_sid[k] = v.set_index(TS_FIELD_NAME) else: - raise ValueError("Data for sid %s must be in DataFrame, " - "Series, or DatetimeIndex." % k) + raise ValueError(BAD_DATA_FORMAT_ERROR % k) # Once data is in a DF, make sure columns are correct. - cols_except_ts = (set(v.columns.values) - + cols_except_ts = (set(v.columns) - {TS_FIELD_NAME} - {SID_FIELD_NAME}) # Check that all columns other than timestamp are as expected. - if cols_except_ts != expected_cols: + if cols_except_ts != self.expected_cols: raise ValueError( - "Expected columns %s for sid %s but got columns %s." 
% - (expected_cols, k, v.columns.values) + WRONG_COLS_ERROR % + (self.expected_cols, k, v.columns.values) ) self.dataset = dataset @@ -109,17 +141,13 @@ class EventsLoader(PipelineLoader): for column in columns ) - def mk_date_series(self, date_field_name): - return {sid: pd.Series(index=event.index, - data=np.array(event[date_field_name])) - for sid, event in iteritems(self.events_by_sid)} - def _next_event_date_loader(self, next_date_field, event_date_field_name): return DataFrameLoader( next_date_field, next_date_frame( self.all_dates, - self.mk_date_series(event_date_field_name), + self.events_by_sid, + event_date_field_name ), adjustments=None, ) @@ -131,7 +159,8 @@ class EventsLoader(PipelineLoader): prev_date_field, previous_date_frame( self.all_dates, - self.mk_date_series(event_date_field_name), + self.events_by_sid, + event_date_field_name, ), adjustments=None, ) diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 1dd685dc..73d0ad3f 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -8,7 +8,7 @@ from six.moves import zip from zipline.utils.numpy_utils import NaTns, NaTD -def next_date_frame(dates, events_by_sid): +def next_date_frame(dates, events_by_sid, event_date_field_name): """ Make a DataFrame representing the simulated next known date for an event. @@ -20,6 +20,9 @@ def next_date_frame(dates, events_by_sid): Dict mapping sids to a series of dates. Each k:v pair of the series represents the date we learned of the event mapping to the date the event will occur. + event_date_field_name : str + The name of the date field that marks when the event occurred. 
+ Returns ------- next_events: pd.DataFrame @@ -37,7 +40,8 @@ def next_date_frame(dates, events_by_sid): equity: np.full_like(dates, NaTns) for equity in events_by_sid } raw_dates = dates.values - for equity, event_dates in iteritems(events_by_sid): + for equity, df in iteritems(events_by_sid): + event_dates = df[event_date_field_name] data = cols[equity] if not event_dates.index.is_monotonic_increasing: event_dates = event_dates.sort_index() @@ -56,7 +60,7 @@ def next_date_frame(dates, events_by_sid): return pd.DataFrame(index=dates, data=cols) -def previous_date_frame(date_index, events_by_sid): +def previous_date_frame(date_index, events_by_sid, event_date_field_name): """ Make a DataFrame representing simulated next earnings date_index. @@ -64,18 +68,20 @@ def previous_date_frame(date_index, events_by_sid): ---------- date_index : DatetimeIndex. The index of the returned DataFrame. - events_by_sid : dict[int -> DatetimeIndex] - Dict mapping sids to a series of dates. Each k:v pair of the series - represents the date we learned of the event mapping to the date the - event will occur. + events_by_sid : dict[int -> pd.DataFrame] + Dict mapping sids to a DataFrame. The index of the DataFrame + represents the date we learned of the event mapping to the event + data. + event_date_field_name : str + The name of the date field that marks when the event occurred. Returns ------- previous_events: pd.DataFrame A DataFrame where each column is a security from `events_by_sid` where - the values are the dates of the previous event that occured on the date - of the index. Entries falling before the first date will have `NaT` as - the result in the output. + the values are the dates of the previous event that occurred on the + date of the index. Entries falling before the first date will have + `NaT` as the result in the output. 
See Also -------- @@ -88,7 +94,7 @@ def previous_date_frame(date_index, events_by_sid): # events_by_sid[sid] is Series mapping knowledge_date to actual # event_date. We don't care about the knowledge date for # computing previous earnings. - values = events_by_sid[sid].values + values = events_by_sid[sid][event_date_field_name].values values = values[values <= d_n] out[date_index.searchsorted(values), col_idx] = values From e257dc1da93cd112d500d826cce65a5f1e8b3ea3 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Mon, 22 Feb 2016 09:15:02 -0500 Subject: [PATCH 10/12] MAINT: refactor constants to common file. TST: refactor constants and clarify comments. TST: clean up/extract constants in tests. MAINT: add/modify constants. MAINT: remove obsolete and alphabetize. MAINT: clean up. MAINT: modify constants to have named params. MAINT: extract constants. STY: fix indentation. MAINT: refactor common part out of buyback_auth. MAINT: refactor earnings test logic. --- tests/pipeline/test_buyback_auth.py | 286 ++++----------- tests/pipeline/test_earnings.py | 346 ++++-------------- tests/pipeline/test_events.py | 229 ++++++++++-- zipline/pipeline/common.py | 17 + zipline/pipeline/loaders/blaze/__init__.py | 19 +- .../pipeline/loaders/blaze/buyback_auth.py | 8 +- zipline/pipeline/loaders/blaze/core.py | 10 +- zipline/pipeline/loaders/blaze/earnings.py | 6 +- zipline/pipeline/loaders/blaze/events.py | 11 +- zipline/pipeline/loaders/buyback_auth.py | 18 +- zipline/pipeline/loaders/earnings.py | 3 +- zipline/pipeline/loaders/events.py | 45 +-- 12 files changed, 403 insertions(+), 595 deletions(-) create mode 100644 zipline/pipeline/common.py diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 26fff9a3..10908f87 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -12,8 +12,19 @@ import numpy as np import pandas as pd from pandas.util.testing import assert_series_equal from six import iteritems 
+from tests.pipeline.test_events import param_dates, EventLoaderCommonTest from zipline.pipeline import Pipeline +from zipline.pipeline.common import( + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + CASH_FIELD_NAME, + DAYS_SINCE_PREV, + PREVIOUS_BUYBACK_ANNOUNCEMENT, + PREVIOUS_BUYBACK_CASH, + PREVIOUS_BUYBACK_SHARE_COUNT, + SHARE_COUNT_FIELD_NAME, + SID_FIELD_NAME, + TS_FIELD_NAME) from zipline.pipeline.data import (CashBuybackAuthorizations, ShareBuybackAuthorizations) from zipline.pipeline.engine import SimplePipelineEngine @@ -26,11 +37,6 @@ from zipline.pipeline.loaders.buyback_auth import \ from zipline.pipeline.loaders.blaze import ( BlazeCashBuybackAuthorizationsLoader, BlazeShareBuybackAuthorizationsLoader, - BUYBACK_ANNOUNCEMENT_FIELD_NAME, - SHARE_COUNT_FIELD_NAME, - SID_FIELD_NAME, - TS_FIELD_NAME, - CASH_FIELD_NAME ) from zipline.utils.numpy_utils import make_datetime64D, NaTD from zipline.utils.test_utils import ( @@ -41,34 +47,26 @@ from zipline.utils.test_utils import ( ) -sids = A, B, C, D, E = range(5) - -equity_info = make_simple_equity_info( - sids, - start_date=pd.Timestamp('2013-01-01', tz='UTC'), - end_date=pd.Timestamp('2015-01-01', tz='UTC'), -) - -buyback_authorizations = { - # K1--K2--A1--A2--SC1--SC2--V1--V2. - A: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), +buyback_authorizations = [ + # K1--K2--A1--A2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', '2014-01-20']), SHARE_COUNT_FIELD_NAME: [1, 15], CASH_FIELD_NAME: [10, 20] }), - # K1--K2--E2--E1. - B: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), + # K1--K2--A2--A1. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ '2014-01-20', '2014-01-15' ]), SHARE_COUNT_FIELD_NAME: [7, 13], CASH_FIELD_NAME: [10, 22] }), - # K1--E1--K2--E2. 
- C: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-15']), + # K1--A1--K2--A2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ '2014-01-10', '2014-01-20' ]), @@ -76,196 +74,34 @@ buyback_authorizations = { CASH_FIELD_NAME: [4, 7] }), # K1 == K2. - D: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05'] * 2), + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ '2014-01-10', '2014-01-15' ]), SHARE_COUNT_FIELD_NAME: [6, 23], CASH_FIELD_NAME: [1, 2] }), - E: pd.DataFrame( - columns=["timestamp", + pd.DataFrame( + columns=[TS_FIELD_NAME, BUYBACK_ANNOUNCEMENT_FIELD_NAME, SHARE_COUNT_FIELD_NAME, CASH_FIELD_NAME], dtype='datetime64[ns]' ), -} - -# Must be a list - can't use generator since this needs to be used more than -# once. -param_dates = list(gen_calendars( - '2014-01-01', - '2014-01-31', - critical_dates=pd.to_datetime([ - '2014-01-05', - '2014-01-10', - '2014-01-15', - '2014-01-20', - ], utc=True), -)) +] -def zip_with_floats(dates, flts): - return pd.Series(flts, index=dates).astype('float') - - -def num_days_between(dates, start_date, end_date): - return num_days_in_range(dates, start_date, end_date) - - -def zip_with_dates(index_dates, dts): - return pd.Series(pd.to_datetime(dts), index=index_dates) - - -class BuybackAuthLoaderCommonTest(object): - """ - Tests for loading the buyback authorization announcement data. - """ - - def loader_args(self, dates): - """Construct the base buyback authorizations object to pass to the - loader. - - Parameters - ---------- - dates : pd.DatetimeIndex - The dates we can serve. - - Returns - ------- - args : tuple[any] - The arguments to forward to the loader positionally. - """ - return dates, self.buyback_authorizations - - def setup_engine(self, dates): - """ - Make a Pipeline Enigne object based on the given dates. 
- """ - loader = self.loader_type(*self.loader_args(dates)) - return SimplePipelineEngine(lambda _: loader, dates, self.finder) - - def setup_expected_cols(self, dates): - """ - Make expectation functions for the given dates calendar. - - This exists to make it easy to test our various cases with critical - dates missing from the calendar. - """ - num_days_between_for_dates = partial(num_days_between, dates) - zip_with_dates_for_dates = partial(zip_with_dates, dates) - _expected_previous_buyback_announcement = pd.DataFrame({ - A: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + - ['2014-01-15'] * num_days_between_for_dates('2014-01-15', - '2014-01-19') + - ['2014-01-20'] * num_days_between_for_dates('2014-01-20', - None), - ), - B: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + - ['2014-01-15'] * num_days_between_for_dates('2014-01-15', - '2014-01-19') + - ['2014-01-20'] * num_days_between_for_dates('2014-01-20', - None), - ), - C: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + - ['2014-01-10'] * num_days_between_for_dates('2014-01-10', - '2014-01-19') + - ['2014-01-20'] * num_days_between_for_dates('2014-01-20', - None), - ), - D: zip_with_dates_for_dates( - ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + - ['2014-01-10'] * num_days_between_for_dates('2014-01-10', - '2014-01-14') + - ['2014-01-15'] * num_days_between_for_dates('2014-01-15', - None), - ), - E: zip_with_dates_for_dates(['NaT'] * len(dates)), - }, index=dates) - - _expected_previous_busday_offsets = self._compute_busday_offsets( - _expected_previous_buyback_announcement - ) - - # Common cols for buyback authorization datasets are announcement - # date and days since previous. 
- self.cols[ - 'previous_buyback_announcement' - ] = _expected_previous_buyback_announcement - self.cols['days_since_prev'] = _expected_previous_busday_offsets - - @staticmethod - def _compute_busday_offsets(announcement_dates): - """ - Compute expected business day offsets from a DataFrame of announcement - dates. - """ - # Column-vector of dates on which factor `compute` will be called. - raw_call_dates = announcement_dates.index.values.astype( - 'datetime64[D]' - )[:, None] - - # 2D array of dates containining expected nexg announcement. - raw_announce_dates = ( - announcement_dates.values.astype('datetime64[D]') - ) - - # Set NaTs to 0 temporarily because busday_count doesn't support NaT. - # We fill these entries with NaNs later. - whereNaT = raw_announce_dates == NaTD - raw_announce_dates[whereNaT] = make_datetime64D(0) - - # The abs call here makes it so that we can use this function to - # compute offsets for both next and previous earnings (previous - # earnings offsets come back negative). - expected = abs(np.busday_count( - raw_call_dates, - raw_announce_dates - ).astype(float)) - - expected[whereNaT] = np.nan - return pd.DataFrame( - data=expected, - columns=announcement_dates.columns, - index=announcement_dates.index, - ) - - def _test_compute_buyback_auth(self, dates): - engine = self.setup_engine(dates) - self.setup_expected_cols(dates) - - pipe = Pipeline( - columns=self.pipeline_columns - ) - - result = engine.run_pipeline( - pipe, - start_date=dates[0], - end_date=dates[-1], - ) - - for sid in sids: - for col_name in self.cols.keys(): - assert_series_equal(result[col_name].xs(sid, level=1), - self.cols[col_name][sid], - check_names=False) - - -class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): +class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonTest): """ Test for cash buyback authorizations dataset. 
""" pipeline_columns = { - 'previous_buyback_cash': + ('%s' % PREVIOUS_BUYBACK_CASH): CashBuybackAuthorizations.previous_value.latest, - 'previous_buyback_announcement': + PREVIOUS_BUYBACK_ANNOUNCEMENT: CashBuybackAuthorizations.previous_announcement_date.latest, - 'days_since_prev': + DAYS_SINCE_PREV: BusinessDaysSincePreviousCashBuybackAuth(), } @@ -273,12 +109,11 @@ class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): def setUpClass(cls): cls._cleanup_stack = stack = ExitStack() cls.finder = stack.enter_context( - tmp_asset_finder(equities=equity_info), + tmp_asset_finder(equities=cls.equity_info), ) cls.cols = {} - cls.buyback_authorizations = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) - for sid, df in - iteritems(buyback_authorizations)} + cls.dataset = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) + for sid, df in enumerate(buyback_authorizations)} cls.loader_type = CashBuybackAuthorizationsLoader @classmethod @@ -286,51 +121,53 @@ class CashBuybackAuthLoaderTestCase(TestCase, BuybackAuthLoaderCommonTest): cls._cleanup_stack.close() def setup(self, dates): - zip_with_floats_dates = partial(zip_with_floats, dates) - num_days_between_dates = partial(num_days_between, dates) - super(CashBuybackAuthLoaderTestCase, self).setup_expected_cols(dates) + zip_with_floats_dates = partial(self.zip_with_floats, dates) + num_days_between_dates = partial(self.num_days_between, dates) _expected_previous_cash = pd.DataFrame({ # TODO if the next knowledge date is 10, why is the range # until 15? 
- A: zip_with_floats_dates( - ['NaN'] * num_days_between(dates, None, '2014-01-14') + + 0: zip_with_floats_dates( + ['NaN'] * num_days_between_dates(None, '2014-01-14') + [10] * num_days_between_dates('2014-01-15', '2014-01-19') + [20] * num_days_between_dates('2014-01-20', None) ), - B: zip_with_floats_dates( + 1: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-14') + [22] * num_days_between_dates('2014-01-15', '2014-01-19') + [10] * num_days_between_dates('2014-01-20', None) ), - C: zip_with_floats_dates( + 2: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-09') + [4] * num_days_between_dates('2014-01-10', '2014-01-19') + [7] * num_days_between_dates('2014-01-20', None) ), - D: zip_with_floats_dates( + 3: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-09') + [1] * num_days_between_dates('2014-01-10', '2014-01-14') + [2] * num_days_between_dates('2014-01-15', None) ), - E: zip_with_floats_dates(['NaN'] * len(dates)), + 4: zip_with_floats_dates(['NaN'] * len(dates)), }, index=dates) - self.cols['previous_buyback_cash'] = _expected_previous_cash + self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] = self.get_expected_previous( + dates) + self.cols[PREVIOUS_BUYBACK_CASH] = _expected_previous_cash + @parameterized.expand(param_dates) def test_compute_cash_buyback_auth(self, dates): self._test_compute_buyback_auth(dates) -class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderCommonTest, TestCase): +class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): """ Test for share buyback authorizations dataset. 
""" pipeline_columns = { - 'previous_buyback_share_count': + ('%s' % PREVIOUS_BUYBACK_SHARE_COUNT): ShareBuybackAuthorizations.previous_share_count.latest, - 'previous_buyback_announcement': + ('%s' % PREVIOUS_BUYBACK_ANNOUNCEMENT): ShareBuybackAuthorizations.previous_announcement_date.latest, - 'days_since_prev': + DAYS_SINCE_PREV: BusinessDaysSincePreviousShareBuybackAuth(), } @@ -338,12 +175,12 @@ class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderCommonTest, TestCase): def setUpClass(cls): cls._cleanup_stack = stack = ExitStack() cls.finder = stack.enter_context( - tmp_asset_finder(equities=equity_info), + tmp_asset_finder(equities=cls.equity_info), ) cls.cols = {} - cls.buyback_authorizations = {sid: df.drop(CASH_FIELD_NAME, 1) + cls.dataset = {sid: df.drop(CASH_FIELD_NAME, 1) for sid, df in - iteritems(buyback_authorizations)} + enumerate(buyback_authorizations)} cls.loader_type = ShareBuybackAuthorizationsLoader @classmethod @@ -351,35 +188,36 @@ class ShareBuybackAuthLoaderTestCase(BuybackAuthLoaderCommonTest, TestCase): cls._cleanup_stack.close() def setup(self, dates): - zip_with_floats_dates = partial(zip_with_floats, dates) - num_days_between_dates = partial(num_days_between, dates) - super(ShareBuybackAuthLoaderTestCase, self).setup_expected_cols(dates) + zip_with_floats_dates = partial(self.zip_with_floats, dates) + num_days_between_dates = partial(self.num_days_between, dates) _expected_previous_buyback_share_count = pd.DataFrame({ - A: zip_with_floats_dates( + 0: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-14') + [1] * num_days_between_dates('2014-01-15', '2014-01-19') + [15] * num_days_between_dates('2014-01-20', None) ), - B: zip_with_floats_dates( + 1: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-14') + [13] * num_days_between_dates('2014-01-15', '2014-01-19') + [7] * num_days_between_dates('2014-01-20', None) ), - C: zip_with_floats_dates( + 2: zip_with_floats_dates( ['NaN'] * 
num_days_between_dates(None, '2014-01-09') + [3] * num_days_between_dates('2014-01-10', '2014-01-19') + [1] * num_days_between_dates('2014-01-20', None) ), - D: zip_with_floats_dates( + 3: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-09') + [6] * num_days_between_dates('2014-01-10', '2014-01-14') + [23] * num_days_between_dates('2014-01-15', None) ), - E: zip_with_floats_dates(['NaN'] * len(dates)), + 4: zip_with_floats_dates(['NaN'] * len(dates)), }, index=dates) self.cols[ - 'previous_buyback_share_count' + PREVIOUS_BUYBACK_SHARE_COUNT ] = _expected_previous_buyback_share_count + self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] = \ + self.get_expected_previous(dates) @parameterized.expand(param_dates) def test_compute_share_buyback_auth(self, dates): diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index 7f9986b8..f5242267 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -7,173 +7,101 @@ import blaze as bz from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack from nose_parameterized import parameterized -import numpy as np import pandas as pd -from pandas.util.testing import assert_series_equal from six import iteritems +from tests.pipeline.test_events import EventLoaderCommonTest, param_dates -from zipline.pipeline import Pipeline +from zipline.pipeline.common import ( + ANNOUNCEMENT_FIELD_NAME, + DAYS_SINCE_PREV, + DAYS_TO_NEXT, + NEXT_ANNOUNCEMENT, + PREVIOUS_ANNOUNCEMENT, + SID_FIELD_NAME, + TS_FIELD_NAME +) from zipline.pipeline.data import EarningsCalendar -from zipline.pipeline.engine import SimplePipelineEngine from zipline.pipeline.factors.events import ( BusinessDaysSincePreviousEarnings, BusinessDaysUntilNextEarnings, ) from zipline.pipeline.loaders.earnings import EarningsCalendarLoader from zipline.pipeline.loaders.blaze import ( - ANNOUNCEMENT_FIELD_NAME, BlazeEarningsCalendarLoader, - SID_FIELD_NAME, - TS_FIELD_NAME, ) -from 
zipline.utils.numpy_utils import make_datetime64D, NaTD from zipline.utils.test_utils import ( - gen_calendars, make_simple_equity_info, - num_days_in_range, tmp_asset_finder, ) +earnings_dates = [ + # K1--K2--E1--E2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', + '2014-01-20']) + }), + # K1--K2--E2--E1. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-20', + '2014-01-15']) + }), + # K1--E1--K2--E2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', + '2014-01-20']) + }), + # K1 == K2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', + '2014-01-15']) + }), + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime([]), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([]) + }) + ] -class EarningsCalendarLoaderTestCase(TestCase): + +class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonTest): """ Tests for loading the earnings announcement data. """ - loader_type = EarningsCalendarLoader + pipeline_columns = { + NEXT_ANNOUNCEMENT: EarningsCalendar.next_announcement.latest, + PREVIOUS_ANNOUNCEMENT: EarningsCalendar.previous_announcement.latest, + DAYS_SINCE_PREV: BusinessDaysSincePreviousEarnings(), + DAYS_TO_NEXT: BusinessDaysUntilNextEarnings(), + } @classmethod def setUpClass(cls): cls._cleanup_stack = stack = ExitStack() - cls.sids = A, B, C, D, E = range(5) equity_info = make_simple_equity_info( cls.sids, start_date=pd.Timestamp('2013-01-01', tz='UTC'), end_date=pd.Timestamp('2015-01-01', tz='UTC'), ) + cls.cols = {} + cls.dataset = {sid: df for sid, df in enumerate(earnings_dates)} cls.finder = stack.enter_context( tmp_asset_finder(equities=equity_info), ) - cls.earnings_dates = { - # K1--K2--E1--E2. 
- A: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', - '2014-01-20']) - }), - # K1--K2--E2--E1. - B: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-10']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-20', - '2014-01-15']) - }), - # K1--E1--K2--E2. - C: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05', '2014-01-15']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', - '2014-01-20']) - }), - # K1 == K2. - D: pd.DataFrame({ - "timestamp": pd.to_datetime(['2014-01-05'] * 2), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', - '2014-01-15']) - }), - E: pd.DataFrame({ - "timestamp": pd.to_datetime([]), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([]) - }) - } + cls.loader_type = EarningsCalendarLoader @classmethod def tearDownClass(cls): cls._cleanup_stack.close() - def loader_args(self, dates): - """Construct the base earnings announcements object to pass to the - loader. - - Parameters - ---------- - dates : pd.DatetimeIndex - The dates we can serve. - - Returns - ------- - args : tuple[any] - The arguments to forward to the loader positionally. - """ - return dates, self.earnings_dates def setup(self, dates): - """ - Make a PipelineEngine and expectation functions for the given dates - calendar. + _expected_next_announce = self.get_expected_next_event_dates(dates) - This exists to make it easy to test our various cases with critical - dates missing from the calendar. 
- """ - A, B, C, D, E = self.sids - - def num_days_between(start_date, end_date): - return num_days_in_range(dates, start_date, end_date) - - def zip_with_dates(dts): - return pd.Series(pd.to_datetime(dts), index=dates) - - _expected_next_announce = pd.DataFrame({ - A: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-04') + - ['2014-01-15'] * num_days_between('2014-01-05', '2014-01-15') + - ['2014-01-20'] * num_days_between('2014-01-16', '2014-01-20') + - ['NaT'] * num_days_between('2014-01-21', None) - ), - B: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-04') + - ['2014-01-20'] * num_days_between('2014-01-05', '2014-01-09') + - ['2014-01-15'] * num_days_between('2014-01-10', '2014-01-15') + - ['2014-01-20'] * num_days_between('2014-01-16', '2014-01-20') + - ['NaT'] * num_days_between('2014-01-21', None) - ), - C: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-04') + - ['2014-01-10'] * num_days_between('2014-01-05', '2014-01-10') + - ['NaT'] * num_days_between('2014-01-11', '2014-01-14') + - ['2014-01-20'] * num_days_between('2014-01-15', '2014-01-20') + - ['NaT'] * num_days_between('2014-01-21', None) - ), - D: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-04') + - ['2014-01-10'] * num_days_between('2014-01-05', '2014-01-10') + - ['2014-01-15'] * num_days_between('2014-01-11', '2014-01-15') + - ['NaT'] * num_days_between('2014-01-16', None) - ), - E: zip_with_dates(['NaT'] * len(dates)), - }, index=dates) - - _expected_previous_announce = pd.DataFrame({ - A: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-14') + - ['2014-01-15'] * num_days_between('2014-01-15', '2014-01-19') + - ['2014-01-20'] * num_days_between('2014-01-20', None) - ), - B: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-14') + - ['2014-01-15'] * num_days_between('2014-01-15', '2014-01-19') + - ['2014-01-20'] * num_days_between('2014-01-20', None) - ), - C: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-09') 
+ - ['2014-01-10'] * num_days_between('2014-01-10', '2014-01-19') + - ['2014-01-20'] * num_days_between('2014-01-20', None) - ), - D: zip_with_dates( - ['NaT'] * num_days_between(None, '2014-01-09') + - ['2014-01-10'] * num_days_between('2014-01-10', '2014-01-14') + - ['2014-01-15'] * num_days_between('2014-01-15', None) - ), - E: zip_with_dates(['NaT'] * len(dates)), - }, index=dates) + _expected_previous_announce = self.get_expected_previous_event_dates(dates) _expected_next_busday_offsets = self._compute_busday_offsets( _expected_next_announce @@ -181,164 +109,21 @@ class EarningsCalendarLoaderTestCase(TestCase): _expected_previous_busday_offsets = self._compute_busday_offsets( _expected_previous_announce ) + self.cols[PREVIOUS_ANNOUNCEMENT] = _expected_previous_announce + self.cols[NEXT_ANNOUNCEMENT] = _expected_next_announce + self.cols[DAYS_TO_NEXT] = _expected_next_busday_offsets + self.cols[DAYS_SINCE_PREV] = _expected_previous_busday_offsets - def expected_next_announce(sid): - """ - Return the expected next announcement dates for ``sid``. - """ - return _expected_next_announce[sid] - - def expected_next_busday_offset(sid): - """ - Return the expected number of days to the next announcement for - ``sid``. - """ - return _expected_next_busday_offsets[sid] - - def expected_previous_announce(sid): - """ - Return the expected previous announcement dates for ``sid``. - """ - return _expected_previous_announce[sid] - - def expected_previous_busday_offset(sid): - """ - Return the expected number of days to the next announcement for - ``sid``. 
- """ - return _expected_previous_busday_offsets[sid] - - loader = self.loader_type(*self.loader_args(dates)) - engine = SimplePipelineEngine(lambda _: loader, dates, self.finder) - return ( - engine, - expected_next_announce, - expected_next_busday_offset, - expected_previous_announce, - expected_previous_busday_offset, - ) - - @staticmethod - def _compute_busday_offsets(announcement_dates): - """ - Compute expected business day offsets from a DataFrame of announcement - dates. - """ - # Column-vector of dates on which factor `compute` will be called. - raw_call_dates = announcement_dates.index.values.astype( - 'datetime64[D]' - )[:, None] - - # 2D array of dates containining expected nexg announcement. - raw_announce_dates = ( - announcement_dates.values.astype('datetime64[D]') - ) - - # Set NaTs to 0 temporarily because busday_count doesn't support NaT. - # We fill these entries with NaNs later. - whereNaT = raw_announce_dates == NaTD - raw_announce_dates[whereNaT] = make_datetime64D(0) - - # The abs call here makes it so that we can use this function to - # compute offsets for both next and previous earnings (previous - # earnings offsets come back negative). 
- expected = abs(np.busday_count( - raw_call_dates, - raw_announce_dates - ).astype(float)) - - expected[whereNaT] = np.nan - return pd.DataFrame( - data=expected, - columns=announcement_dates.columns, - index=announcement_dates.index, - ) - - @parameterized.expand(gen_calendars( - '2014-01-01', - '2014-01-31', - critical_dates=pd.to_datetime([ - '2014-01-05', - '2014-01-10', - '2014-01-15', - '2014-01-20', - ], utc=True), - )) + @parameterized.expand(param_dates) def test_compute_earnings(self, dates): - - ( - engine, - expected_next, - expected_next_busday_offset, - expected_previous, - expected_previous_busday_offset, - ) = self.setup(dates) - - pipe = Pipeline( - columns={ - 'next': EarningsCalendar.next_announcement.latest, - 'previous': EarningsCalendar.previous_announcement.latest, - 'days_to_next': BusinessDaysUntilNextEarnings(), - 'days_since_prev': BusinessDaysSincePreviousEarnings(), - } - ) - - result = engine.run_pipeline( - pipe, - start_date=dates[0], - end_date=dates[-1], - ) - - computed_next = result['next'] - computed_previous = result['previous'] - computed_next_busday_offset = result['days_to_next'] - computed_previous_busday_offset = result['days_since_prev'] - - # NaTs in next/prev should correspond to NaNs in offsets. 
- assert_series_equal( - computed_next.isnull(), - computed_next_busday_offset.isnull(), - check_names=False, - ) - assert_series_equal( - computed_previous.isnull(), - computed_previous_busday_offset.isnull(), - check_names=False, - ) - - for sid in self.sids: - - assert_series_equal( - computed_next.xs(sid, level=1), - expected_next(sid), - sid, - check_names=False, - ) - - assert_series_equal( - computed_previous.xs(sid, level=1), - expected_previous(sid), - sid, - check_names=False, - ) - - assert_series_equal( - computed_next_busday_offset.xs(sid, level=1), - expected_next_busday_offset(sid), - sid, - check_names=False, - ) - - assert_series_equal( - computed_previous_busday_offset.xs(sid, level=1), - expected_previous_busday_offset(sid), - sid, - check_names=False, - ) + self._test_compute(dates) class BlazeEarningsCalendarLoaderTestCase(EarningsCalendarLoaderTestCase): - loader_type = BlazeEarningsCalendarLoader + @classmethod + def setUpClass(cls): + super(BlazeEarningsCalendarLoaderTestCase, cls).setUpClass() + cls.loader_type = BlazeEarningsCalendarLoader def loader_args(self, dates): _, mapping = super( @@ -359,6 +144,11 @@ class BlazeEarningsCalendarLoaderNotInteractiveTestCase( BlazeEarningsCalendarLoaderTestCase): """Test case for passing a non-interactive symbol and a dict of resources. """ + @classmethod + def setUpClass(cls): + super(BlazeEarningsCalendarLoaderNotInteractiveTestCase, cls).setUpClass() + cls.loader_type = BlazeEarningsCalendarLoader + def loader_args(self, dates): (bound_expr,) = super( BlazeEarningsCalendarLoaderNotInteractiveTestCase, diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 62df1ea3..33c328ec 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -1,12 +1,20 @@ """ Tests for setting up an EventsLoader and a BlazeEventsLoader. 
""" +from functools import partial from nose_parameterized import parameterized import blaze as bz +import numpy as np import pandas as pd -from pandas.util.testing import assert_series_equal, TestCase, assertRaises +from pandas.util.testing import assert_series_equal, TestCase +from zipline.pipeline import SimplePipelineEngine, Pipeline +from zipline.pipeline.common import ( + ANNOUNCEMENT_FIELD_NAME, + SID_FIELD_NAME, + TS_FIELD_NAME +) from zipline.pipeline.data import DataSet, Column from zipline.pipeline.loaders.blaze.events import BlazeEventsLoader from zipline.pipeline.loaders.events import ( @@ -15,21 +23,15 @@ from zipline.pipeline.loaders.events import ( DTINDEX_NOT_INFER_TS_ERROR, EventsLoader, SERIES_NO_DTINDEX_ERROR, - SID_FIELD_NAME, - TS_FIELD_NAME, WRONG_COLS_ERROR, ) from zipline.utils.memoize import lazyval -from zipline.utils.numpy_utils import datetime64ns_dtype +from zipline.utils.numpy_utils import datetime64ns_dtype, NaTD, make_datetime64D +from zipline.utils.test_utils import gen_calendars, num_days_in_range, \ + make_simple_equity_info ABSTRACT_METHODS_ERROR = 'abstract methods concrete_loader' -DAYS_SINCE_PREV = 'days_since_prev' - -PREVIOUS_ANNOUNCEMENT = 'previous_announcement' - -ANNOUNCEMENT_FIELD_NAME = 'announcement_date' - class EventDataSet(DataSet): previous_announcement = Column(datetime64ns_dtype) @@ -86,28 +88,32 @@ class EventDataSetLoaderNoExpectedCols(EventsLoader): dtx = pd.date_range('2014-01-01', '2014-01-10') -def assert_loader_error(events_by_sid, error, msg, infer_timestamps=True): - with assertRaises(error) as context: +def assert_loader_error(events_by_sid, error, msg, infer_timestamps): + with TestCase.assertRaises(error) as context: EventDataSetLoader( dtx, events_by_sid, infer_timestamps=infer_timestamps, ) - assert msg in context.exception + TestCase.assertTrue(msg in context.exception) class EventLoaderTestCase(TestCase): def test_no_expected_cols_defined(self): events_by_sid = {0: 
pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx})} - assert_loader_error(events_by_sid, TypeError, ABSTRACT_METHODS_ERROR) + assert_loader_error(events_by_sid, TypeError, ABSTRACT_METHODS_ERROR, + True) def test_wrong_cols(self): wrong_col_name = 'some_other_col' # Test wrong cols (cols != expected) events_by_sid = {0: pd.DataFrame({wrong_col_name: dtx})} assert_loader_error( - events_by_sid, ValueError, WRONG_COLS_ERROR % ( - EventDataSetLoader.expected_cols, 0, wrong_col_name - ) + events_by_sid, ValueError, WRONG_COLS_ERROR.format( + expected_columns=EventDataSetLoader.expected_cols, + sid=0, + resulting_columns=wrong_col_name, + ), + True ) @parameterized.expand([ @@ -135,29 +141,36 @@ class EventLoaderTestCase(TestCase): ) if infer_timestamps: - expected = pd.Series(index=[dtx[0]] * 10, data=dtx, ) + expected = pd.Series(index=[dtx[0]] * 10, data=dtx, + name=ANNOUNCEMENT_FIELD_NAME) else: - expected = pd.Series(index=dtx, data=dtx,) + expected = pd.Series(index=dtx, data=dtx, + name=ANNOUNCEMENT_FIELD_NAME) + expected.index.name = TS_FIELD_NAME # Check that index by first given date has been added assert_series_equal( loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME], expected, - check_names=False ) @parameterized.expand([ # DataFrame without timestamp column and infer_timestamps = True - [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), False, - DF_NO_TS_NOT_INFER_TS_ERROR % (TS_FIELD_NAME, 0)], + [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), + False, + DF_NO_TS_NOT_INFER_TS_ERROR.format( + timestamp_column_name=TS_FIELD_NAME, + sid=0 + ) + ], # DatetimeIndex with infer_timestamps = False [pd.DatetimeIndex(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, - DTINDEX_NOT_INFER_TS_ERROR % 0], + DTINDEX_NOT_INFER_TS_ERROR.format(sid=0)], # Series with DatetimeIndex as index and infer_timestamps = False [pd.Series(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, - SERIES_NO_DTINDEX_ERROR % 0], + SERIES_NO_DTINDEX_ERROR.format(sid=0)], # Some other data structure that is not expected - 
[dtx, False, BAD_DATA_FORMAT_ERROR % 0], - [dtx, True, BAD_DATA_FORMAT_ERROR % 0] + [dtx, False, BAD_DATA_FORMAT_ERROR.format(sid=0)], + [dtx, True, BAD_DATA_FORMAT_ERROR.format(sid=0)] ]) def test_bad_conversion_to_df(self, df, infer_timestamps, msg): events_by_sid = {0: df} @@ -180,7 +193,7 @@ class BlazeEventDataSetLoaderNoConcreteLoader(BlazeEventsLoader): class BlazeEventLoaderTestCase(TestCase): # Blaze loader: need to test failure if no concrete loader def test_no_concrete_loader_defined(self): - with assertRaises(TypeError) as context: + with TestCase.assertRaises(TypeError) as context: BlazeEventDataSetLoaderNoConcreteLoader( bz.Data( pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx, @@ -188,4 +201,164 @@ class BlazeEventLoaderTestCase(TestCase): }) ) ) - assert ABSTRACT_METHODS_ERROR in context.exception + TestCase.assertTrue(ABSTRACT_METHODS_ERROR in context.exception) + + + + + + + +########################## + + +# Must be a list - can't use generator since this needs to be used more than +# once. +param_dates = list(gen_calendars( + '2014-01-01', + '2014-01-31', + critical_dates=pd.to_datetime([ + '2014-01-05', + '2014-01-10', + '2014-01-15', + '2014-01-20', + ], utc=True), +)) + + +class EventLoaderCommonTest(object): + sids = A, B, C, D, E = range(5) + equity_info = make_simple_equity_info( + sids, + start_date=pd.Timestamp('2013-01-01', tz='UTC'), + end_date=pd.Timestamp('2015-01-01', tz='UTC'), + ) + + def zip_with_floats(self, dates, flts): + return pd.Series(flts, index=dates).astype('float') + + + def num_days_between(self, dates, start_date, end_date): + return num_days_in_range(dates, start_date, end_date) + + + def zip_with_dates(self, index_dates, dts): + return pd.Series(pd.to_datetime(dts), index=index_dates) + + + def loader_args(self, dates): + """Construct the base object to pass to the loader. + + Parameters + ---------- + dates : pd.DatetimeIndex + The dates we can serve. 
+ + Returns + ------- + args : tuple[any] + The arguments to forward to the loader positionally. + """ + return dates, self.dataset + + def setup_engine(self, dates): + """ + Make a Pipeline Engine object based on the given dates. + """ + loader = self.loader_type(*self.loader_args(dates)) + return SimplePipelineEngine(lambda _: loader, dates, self.finder) + + def get_expected_previous(self, dates): + num_days_between_for_dates = partial(self.num_days_between, dates) + zip_with_dates_for_dates = partial(self.zip_with_dates, dates) + return pd.DataFrame({ + 0: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), + ), + 1: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), + ), + 2: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + + ['2014-01-10'] * num_days_between_for_dates('2014-01-10', + '2014-01-19') + + ['2014-01-20'] * num_days_between_for_dates('2014-01-20', + None), + ), + 3: zip_with_dates_for_dates( + ['NaT'] * num_days_between_for_dates(None, '2014-01-09') + + ['2014-01-10'] * num_days_between_for_dates('2014-01-10', + '2014-01-14') + + ['2014-01-15'] * num_days_between_for_dates('2014-01-15', + None), + ), + 4: zip_with_dates_for_dates(['NaT'] * len(dates)), + }, index=dates) + + @staticmethod + def _compute_busday_offsets(announcement_dates): + """ + Compute expected business day offsets from a DataFrame of announcement + dates. + """ + # Column-vector of dates on which factor `compute` will be called. + raw_call_dates = announcement_dates.index.values.astype( + 'datetime64[D]' + )[:, None] + + # 2D array of dates containing expected next announcement. 
+ raw_announce_dates = ( + announcement_dates.values.astype('datetime64[D]') + ) + + # Set NaTs to 0 temporarily because busday_count doesn't support NaT. + # We fill these entries with NaNs later. + whereNaT = raw_announce_dates == NaTD + raw_announce_dates[whereNaT] = make_datetime64D(0) + + # The abs call here makes it so that we can use this function to + # compute offsets for both next and previous earnings (previous + # earnings offsets come back negative). + expected = abs(np.busday_count( + raw_call_dates, + raw_announce_dates + ).astype(float)) + + expected[whereNaT] = np.nan + return pd.DataFrame( + data=expected, + columns=announcement_dates.columns, + index=announcement_dates.index, + ) + + def _test_compute_buyback_auth(self, dates): + engine = self.setup_engine(dates) + self.setup(dates) + + pipe = Pipeline( + columns=self.pipeline_columns + ) + + result = engine.run_pipeline( + pipe, + start_date=dates[0], + end_date=dates[-1], + ) + + for sid in self.sids: + for col_name in self.cols.keys(): + assert_series_equal(result[col_name].xs(sid, level=1), + self.cols[col_name][sid], + check_names=False) + + + + diff --git a/zipline/pipeline/common.py b/zipline/pipeline/common.py new file mode 100644 index 00000000..de225409 --- /dev/null +++ b/zipline/pipeline/common.py @@ -0,0 +1,17 @@ +""" +Common constants for Pipeline. 
+""" +AD_FIELD_NAME = 'asof_date' +ANNOUNCEMENT_FIELD_NAME = 'announcement_date' +CASH_FIELD_NAME = 'cash' +BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_date' +DAYS_SINCE_PREV = 'days_since_prev' +DAYS_TO_NEXT = 'days_to_next' +NEXT_ANNOUNCEMENT = 'next_announcement' +PREVIOUS_ANNOUNCEMENT = 'previous_announcement' +PREVIOUS_BUYBACK_ANNOUNCEMENT = 'previous_buyback_announcement' +PREVIOUS_BUYBACK_CASH = 'previous_buyback_cash' +PREVIOUS_BUYBACK_SHARE_COUNT = 'previous_buyback_share_count' +SHARE_COUNT_FIELD_NAME = 'share_count' +SID_FIELD_NAME = 'sid' +TS_FIELD_NAME = 'timestamp' diff --git a/zipline/pipeline/loaders/blaze/__init__.py b/zipline/pipeline/loaders/blaze/__init__.py index 301cbc7d..838e3577 100644 --- a/zipline/pipeline/loaders/blaze/__init__.py +++ b/zipline/pipeline/loaders/blaze/__init__.py @@ -4,37 +4,22 @@ from .buyback_auth import ( BlazeShareBuybackAuthorizationsLoader ) from .core import ( - AD_FIELD_NAME, BlazeLoader, NoDeltasWarning, - SID_FIELD_NAME, - TS_FIELD_NAME, from_blaze, global_loader, ) -from .buyback_auth import ( - BUYBACK_ANNOUNCEMENT_FIELD_NAME, - SHARE_COUNT_FIELD_NAME, - CASH_FIELD_NAME -) + from .earnings import ( - ANNOUNCEMENT_FIELD_NAME, BlazeEarningsCalendarLoader, ) __all__ = ( - 'AD_FIELD_NAME', - 'ANNOUNCEMENT_FIELD_NAME', 'BlazeCashBuybackAuthorizationsLoader', 'BlazeEarningsCalendarLoader', 'BlazeLoader', 'BlazeShareBuybackAuthorizationsLoader', - 'BUYBACK_ANNOUNCEMENT_FIELD_NAME', - 'NoDeltasWarning', - 'SHARE_COUNT_FIELD_NAME', - 'SID_FIELD_NAME', - 'TS_FIELD_NAME', - 'CASH_FIELD_NAME', 'from_blaze', 'global_loader', + 'NoDeltasWarning', ) diff --git a/zipline/pipeline/loaders/blaze/buyback_auth.py b/zipline/pipeline/loaders/blaze/buyback_auth.py index 20dbfcda..52305e30 100644 --- a/zipline/pipeline/loaders/blaze/buyback_auth.py +++ b/zipline/pipeline/loaders/blaze/buyback_auth.py @@ -2,14 +2,16 @@ from .core import ( TS_FIELD_NAME, SID_FIELD_NAME, ) +from zipline.pipeline.common import ( + 
BUYBACK_ANNOUNCEMENT_FIELD_NAME, + CASH_FIELD_NAME, + SHARE_COUNT_FIELD_NAME +) from zipline.pipeline.data import (CashBuybackAuthorizations, ShareBuybackAuthorizations) from zipline.pipeline.loaders.buyback_auth import ( - BUYBACK_ANNOUNCEMENT_FIELD_NAME, CashBuybackAuthorizationsLoader, - CASH_FIELD_NAME, ShareBuybackAuthorizationsLoader, - SHARE_COUNT_FIELD_NAME ) from .events import BlazeEventsLoader diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index df34c975..6a97540e 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -158,7 +158,11 @@ from toolz import ( ) import toolz.curried.operator as op - +from zipline.pipeline.common import ( + AD_FIELD_NAME, + SID_FIELD_NAME, + TS_FIELD_NAME +) from zipline.pipeline.data.dataset import DataSet, Column from zipline.pipeline.loaders.utils import ( check_data_query_args, @@ -179,10 +183,6 @@ from zipline.utils.pandas_utils import sort_values from zipline.utils.preprocess import preprocess -AD_FIELD_NAME = 'asof_date' -TS_FIELD_NAME = 'timestamp' -SID_FIELD_NAME = 'sid' -ANNOUNCEMENT_FIELD_NAME = 'announcement_date' valid_deltas_node_types = ( bz.expr.Field, bz.expr.ReLabel, diff --git a/zipline/pipeline/loaders/blaze/earnings.py b/zipline/pipeline/loaders/blaze/earnings.py index 06d16738..80b963d0 100644 --- a/zipline/pipeline/loaders/blaze/earnings.py +++ b/zipline/pipeline/loaders/blaze/earnings.py @@ -1,7 +1,7 @@ -from .core import ( +from zipline.pipeline.common import ( ANNOUNCEMENT_FIELD_NAME, + SID_FIELD_NAME, TS_FIELD_NAME, - SID_FIELD_NAME ) from zipline.pipeline.data import EarningsCalendar from zipline.pipeline.loaders.earnings import EarningsCalendarLoader @@ -26,8 +26,6 @@ class BlazeEarningsCalendarLoader(BlazeEventsLoader): The timezeone to use for the data query cutoff. dataset: DataSet The DataSet object for which this loader loads data. 
- concrete_loader: EventsLoader - The reference loader to use for this dataset. Notes ----- diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 70850251..3d334ce6 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -3,12 +3,14 @@ import abc from datashape import istabular from .core import ( - TS_FIELD_NAME, - SID_FIELD_NAME, bind_expression_to_resources, ffill_query_in_range, ) from zipline.pipeline.loaders.base import PipelineLoader +from zipline.pipeline.common import ( + SID_FIELD_NAME, + TS_FIELD_NAME, +) from zipline.pipeline.loaders.utils import ( check_data_query_args, normalize_data_query_bounds, @@ -36,6 +38,7 @@ class BlazeEventsLoader(PipelineLoader): The timezeone to use for the data query cutoff. dataset : DataSet The DataSet object for which this loader loads data. + Notes ----- The expression should have a tabular dshape of:: @@ -47,7 +50,7 @@ class BlazeEventsLoader(PipelineLoader): And other dataset-specific fields, where each row of the table is a record including the sid to identify the company, the timestamp where we - learned about the announcement, and the date when the earnings will be + learned about the announcement, and the date when the earnings will be announced. If the '{TS_FIELD_NAME}' field is not included it is assumed that we @@ -82,7 +85,7 @@ class BlazeEventsLoader(PipelineLoader): @abc.abstractproperty def concrete_loader(self): - pass + raise NotImplementedError('concrete_loader') def load_adjusted_array(self, columns, dates, assets, mask): data_query_time = self._data_query_time diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index eb14689e..677732ec 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -1,5 +1,5 @@ """ -Reference implementation for EarningsCalendar loaders. +Reference implementation for buyback auth loaders. 
""" from ..data.buyback_auth import ( @@ -7,18 +7,18 @@ from ..data.buyback_auth import ( ShareBuybackAuthorizations ) from .events import EventsLoader +from zipline.pipeline.common import ( + BUYBACK_ANNOUNCEMENT_FIELD_NAME, + CASH_FIELD_NAME, + SHARE_COUNT_FIELD_NAME +) from zipline.utils.memoize import lazyval -BUYBACK_ANNOUNCEMENT_FIELD_NAME = 'buyback_date' -SHARE_COUNT_FIELD_NAME = 'share_count' -CASH_FIELD_NAME = 'cash' - - class CashBuybackAuthorizationsLoader(EventsLoader): """ Reference loader for - :class:`zipline.pipeline.data.earnings.CashBuybackAuthorizations`. + :class:`zipline.pipeline.data.CashBuybackAuthorizations`. events_by_sid: dict[sid -> pd.DataFrame(knowledge date, event date, cash value)] @@ -60,13 +60,13 @@ class CashBuybackAuthorizationsLoader(EventsLoader): class ShareBuybackAuthorizationsLoader(EventsLoader): """ Reference loader for - :class:`zipline.pipeline.data.earnings.ShareBuybackAuthorizations`. + :class:`zipline.pipeline.data.ShareBuybackAuthorizations`. Does not currently support adjustments to the dates of known buyback authorizations. events_by_sid: dict[sid -> pd.DataFrame(knowledge date, - event date, share value)] + event date, share value)] """ diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index 74d888f2..f3225790 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -4,10 +4,9 @@ Reference implementation for EarningsCalendar loaders. 
from ..data.earnings import EarningsCalendar from .events import EventsLoader +from zipline.pipeline.common import ANNOUNCEMENT_FIELD_NAME from zipline.utils.memoize import lazyval -ANNOUNCEMENT_FIELD_NAME = "announcement_date" - class EarningsCalendarLoader(EventsLoader): diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 9c18611a..dd20b7c5 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -6,27 +6,27 @@ from toolz import merge from .base import PipelineLoader from .frame import DataFrameLoader from .utils import next_date_frame, previous_date_frame, previous_value +from zipline.pipeline.common import SID_FIELD_NAME, TS_FIELD_NAME -WRONG_COLS_ERROR = "Expected columns %s for sid %s but got columns %s." +WRONG_COLS_ERROR = "Expected columns {expected_columns} for sid {sid} but " \ + "got columns {resulting_columns}." -BAD_DATA_FORMAT_ERROR = ("Data for sid %s must be in DataFrame, " +BAD_DATA_FORMAT_ERROR = ("Data for sid {sid} must be in DataFrame, " "Series, or DatetimeIndex.") -SERIES_NO_DTINDEX_ERROR = ("Got Series for sid %d, but index was not " +SERIES_NO_DTINDEX_ERROR = ("Got Series for sid {sid}, but index was not " "DatetimeIndex.") -DTINDEX_NOT_INFER_TS_ERROR = ("Got DatetimeIndex for sid %d.\n" +DTINDEX_NOT_INFER_TS_ERROR = ("Got DatetimeIndex for sid {sid}.\n" "Pass `infer_timestamps=True` to use the first " "date in `all_dates` as implicit timestamp.") -DF_NO_TS_NOT_INFER_TS_ERROR = ("Got DataFrame without a '%r' column for sid " - "%d.\nPass `infer_timestamps=True` to use the " +DF_NO_TS_NOT_INFER_TS_ERROR = ("Got DataFrame without a '{" + "timestamp_column_name}' column for sid {sid}." 
+ "\nPass `infer_timestamps=True` to use the " "first date in `all_dates` as implicit " "timestamp.") -TS_FIELD_NAME = "timestamp" -SID_FIELD_NAME = "sid" - class EventsLoader(PipelineLoader): """ @@ -38,8 +38,7 @@ class EventsLoader(PipelineLoader): ---------- all_dates : pd.DatetimeIndex Index of dates for which we can serve queries. - events_by_sid : dict[int -> pd.DataFrame], dict[int -> pd.Series], - or dict[int -> pd.DatetimeIndex] + events_by_sid : dict[int -> pd.DataFrame or pd.Series or pd.DatetimeIndex] Dict mapping sids to objects representing dates on which earnings occurred. @@ -65,13 +64,12 @@ class EventsLoader(PipelineLoader): Whether to allow omitting the "timestamp" column. dataset : DataSet The DataSet object for which this loader loads data. - expected_cols : frozenset - Set of expected columns for the dataset, without timestamp. + """ @abc.abstractproperty def expected_cols(self): - pass + raise NotImplemented('expected_cols') def __init__(self, all_dates, @@ -91,13 +89,13 @@ class EventsLoader(PipelineLoader): if isinstance(v, pd.Series): if not isinstance(v.index, pd.DatetimeIndex): raise ValueError( - SERIES_NO_DTINDEX_ERROR % k + SERIES_NO_DTINDEX_ERROR.format(sid=k) ) self.events_by_sid[k] = v = pd.DataFrame(v) elif isinstance(v, pd.DatetimeIndex): if not infer_timestamps: raise ValueError( - DTINDEX_NOT_INFER_TS_ERROR % k + DTINDEX_NOT_INFER_TS_ERROR.format(sid=k) ) self.events_by_sid[k] = v = pd.DataFrame( v, index=[dates[0]] * len(v) @@ -107,15 +105,17 @@ class EventsLoader(PipelineLoader): if TS_FIELD_NAME not in v.columns: if not infer_timestamps: raise ValueError( - DF_NO_TS_NOT_INFER_TS_ERROR % - (TS_FIELD_NAME, k) + DF_NO_TS_NOT_INFER_TS_ERROR.format( + timestamp_column_name=TS_FIELD_NAME, + sid=k + ) ) self.events_by_sid[k] = v = v.copy() v.index = [dates[0]] * len(v) else: self.events_by_sid[k] = v.set_index(TS_FIELD_NAME) else: - raise ValueError(BAD_DATA_FORMAT_ERROR % k) + raise ValueError(BAD_DATA_FORMAT_ERROR.format(sid=k)) 
# Once data is in a DF, make sure columns are correct. cols_except_ts = (set(v.columns) - {TS_FIELD_NAME} - @@ -123,8 +123,11 @@ class EventsLoader(PipelineLoader): # Check that all columns other than timestamp are as expected. if cols_except_ts != self.expected_cols: raise ValueError( - WRONG_COLS_ERROR % - (self.expected_cols, k, v.columns.values) + WRONG_COLS_ERROR .format( + expected_columns=self.expected_cols, + sid=k, + resulting_columns=v.columns.values + ) ) self.dataset = dataset From 5b37af6e04506402e391892a5ed41ae0bdcf40c7 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Mon, 22 Feb 2016 14:09:39 -0500 Subject: [PATCH 11/12] TST: enhance test; add more common data. STY: fixes for PEP8. --- tests/pipeline/test_buyback_auth.py | 37 ++++---- tests/pipeline/test_earnings.py | 66 +++++++------- tests/pipeline/test_events.py | 130 ++++++++++++++++++++-------- 3 files changed, 143 insertions(+), 90 deletions(-) diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 10908f87..6d4ee10b 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -8,13 +8,10 @@ import blaze as bz from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack from nose_parameterized import parameterized -import numpy as np import pandas as pd -from pandas.util.testing import assert_series_equal from six import iteritems from tests.pipeline.test_events import param_dates, EventLoaderCommonTest -from zipline.pipeline import Pipeline from zipline.pipeline.common import( BUYBACK_ANNOUNCEMENT_FIELD_NAME, CASH_FIELD_NAME, @@ -27,7 +24,6 @@ from zipline.pipeline.common import( TS_FIELD_NAME) from zipline.pipeline.data import (CashBuybackAuthorizations, ShareBuybackAuthorizations) -from zipline.pipeline.engine import SimplePipelineEngine from zipline.pipeline.factors.events import ( BusinessDaysSincePreviousCashBuybackAuth, BusinessDaysSincePreviousShareBuybackAuth @@ -38,11 +34,7 @@ from 
zipline.pipeline.loaders.blaze import ( BlazeCashBuybackAuthorizationsLoader, BlazeShareBuybackAuthorizationsLoader, ) -from zipline.utils.numpy_utils import make_datetime64D, NaTD from zipline.utils.test_utils import ( - gen_calendars, - make_simple_equity_info, - num_days_in_range, tmp_asset_finder, ) @@ -97,7 +89,7 @@ class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonTest): Test for cash buyback authorizations dataset. """ pipeline_columns = { - ('%s' % PREVIOUS_BUYBACK_CASH): + (PREVIOUS_BUYBACK_CASH): CashBuybackAuthorizations.previous_value.latest, PREVIOUS_BUYBACK_ANNOUNCEMENT: CashBuybackAuthorizations.previous_announcement_date.latest, @@ -124,8 +116,6 @@ class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonTest): zip_with_floats_dates = partial(self.zip_with_floats, dates) num_days_between_dates = partial(self.num_days_between, dates) _expected_previous_cash = pd.DataFrame({ - # TODO if the next knowledge date is 10, why is the range - # until 15? 0: zip_with_floats_dates( ['NaN'] * num_days_between_dates(None, '2014-01-14') + [10] * num_days_between_dates('2014-01-15', '2014-01-19') + @@ -148,14 +138,16 @@ class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonTest): ), 4: zip_with_floats_dates(['NaN'] * len(dates)), }, index=dates) - self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] = self.get_expected_previous( - dates) + self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] = \ + self.get_expected_previous_event_dates(dates) self.cols[PREVIOUS_BUYBACK_CASH] = _expected_previous_cash - + self.cols[DAYS_SINCE_PREV] = self._compute_busday_offsets( + self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] + ) @parameterized.expand(param_dates) def test_compute_cash_buyback_auth(self, dates): - self._test_compute_buyback_auth(dates) + self._test_compute(dates) class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): @@ -163,9 +155,9 @@ class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): Test for share buyback authorizations 
dataset. """ pipeline_columns = { - ('%s' % PREVIOUS_BUYBACK_SHARE_COUNT): + PREVIOUS_BUYBACK_SHARE_COUNT: ShareBuybackAuthorizations.previous_share_count.latest, - ('%s' % PREVIOUS_BUYBACK_ANNOUNCEMENT): + PREVIOUS_BUYBACK_ANNOUNCEMENT: ShareBuybackAuthorizations.previous_announcement_date.latest, DAYS_SINCE_PREV: BusinessDaysSincePreviousShareBuybackAuth(), @@ -179,8 +171,8 @@ class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): ) cls.cols = {} cls.dataset = {sid: df.drop(CASH_FIELD_NAME, 1) - for sid, df in - enumerate(buyback_authorizations)} + for sid, df in + enumerate(buyback_authorizations)} cls.loader_type = ShareBuybackAuthorizationsLoader @classmethod @@ -217,11 +209,14 @@ class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): PREVIOUS_BUYBACK_SHARE_COUNT ] = _expected_previous_buyback_share_count self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] = \ - self.get_expected_previous(dates) + self.get_expected_previous_event_dates(dates) + self.cols[DAYS_SINCE_PREV] = self._compute_busday_offsets( + self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] + ) @parameterized.expand(param_dates) def test_compute_share_buyback_auth(self, dates): - self._test_compute_buyback_auth(dates) + self._test_compute(dates) class BlazeCashBuybackAuthLoaderTestCase(CashBuybackAuthLoaderTestCase): diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index f5242267..a4bce18b 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -35,35 +35,35 @@ from zipline.utils.test_utils import ( ) earnings_dates = [ - # K1--K2--E1--E2. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', - '2014-01-20']) - }), - # K1--K2--E2--E1. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-20', - '2014-01-15']) - }), - # K1--E1--K2--E2. 
- pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', - '2014-01-20']) - }), - # K1 == K2. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', - '2014-01-15']) - }), - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime([]), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([]) - }) - ] + # K1--K2--E1--E2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', + '2014-01-20']) + }), + # K1--K2--E2--E1. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-20', + '2014-01-15']) + }), + # K1--E1--K2--E2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', + '2014-01-20']) + }), + # K1 == K2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', + '2014-01-15']) + }), + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime([]), + ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([]) + }) +] class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonTest): @@ -97,11 +97,12 @@ class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonTest): def tearDownClass(cls): cls._cleanup_stack.close() - def setup(self, dates): _expected_next_announce = self.get_expected_next_event_dates(dates) - _expected_previous_announce = self.get_expected_previous_event_dates(dates) + _expected_previous_announce = self.get_expected_previous_event_dates( + dates + ) _expected_next_busday_offsets = self._compute_busday_offsets( _expected_next_announce @@ -146,7 +147,8 @@ class BlazeEarningsCalendarLoaderNotInteractiveTestCase( """ @classmethod def setUpClass(cls): - super(BlazeEarningsCalendarLoaderNotInteractiveTestCase, cls).setUpClass() + 
super(BlazeEarningsCalendarLoaderNotInteractiveTestCase, + cls).setUpClass() cls.loader_type = BlazeEarningsCalendarLoader def loader_args(self, dates): diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 33c328ec..6c94b902 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -26,7 +26,9 @@ from zipline.pipeline.loaders.events import ( WRONG_COLS_ERROR, ) from zipline.utils.memoize import lazyval -from zipline.utils.numpy_utils import datetime64ns_dtype, NaTD, make_datetime64D +from zipline.utils.numpy_utils import (datetime64ns_dtype, + NaTD, + make_datetime64D) from zipline.utils.test_utils import gen_calendars, num_days_in_range, \ make_simple_equity_info @@ -153,25 +155,42 @@ class EventLoaderTestCase(TestCase): expected, ) - @parameterized.expand([ - # DataFrame without timestamp column and infer_timestamps = True - [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), - False, - DF_NO_TS_NOT_INFER_TS_ERROR.format( - timestamp_column_name=TS_FIELD_NAME, - sid=0 - ) - ], - # DatetimeIndex with infer_timestamps = False - [pd.DatetimeIndex(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, - DTINDEX_NOT_INFER_TS_ERROR.format(sid=0)], - # Series with DatetimeIndex as index and infer_timestamps = False - [pd.Series(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, - SERIES_NO_DTINDEX_ERROR.format(sid=0)], - # Some other data structure that is not expected - [dtx, False, BAD_DATA_FORMAT_ERROR.format(sid=0)], - [dtx, True, BAD_DATA_FORMAT_ERROR.format(sid=0)] - ]) + @parameterized.expand( + [ + # DataFrame without timestamp column and infer_timestamps = True + [ + pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), + False, + DF_NO_TS_NOT_INFER_TS_ERROR.format( + timestamp_column_name=TS_FIELD_NAME, + sid=0 + ) + ], + # DatetimeIndex with infer_timestamps = False + [ + pd.DatetimeIndex(dtx, name=ANNOUNCEMENT_FIELD_NAME), + False, + DTINDEX_NOT_INFER_TS_ERROR.format(sid=0) + ], + # Series with DatetimeIndex as index and infer_timestamps = 
False + [ + pd.Series(dtx, name=ANNOUNCEMENT_FIELD_NAME), + False, + SERIES_NO_DTINDEX_ERROR.format(sid=0) + ], + # Some other data structure that is not expected + [ + dtx, + False, + BAD_DATA_FORMAT_ERROR.format(sid=0) + ], + [ + dtx, + True, + BAD_DATA_FORMAT_ERROR.format(sid=0) + ] + ] + ) def test_bad_conversion_to_df(self, df, infer_timestamps, msg): events_by_sid = {0: df} assert_loader_error(events_by_sid, ValueError, msg, @@ -204,14 +223,6 @@ class BlazeEventLoaderTestCase(TestCase): TestCase.assertTrue(ABSTRACT_METHODS_ERROR in context.exception) - - - - - -########################## - - # Must be a list - can't use generator since this needs to be used more than # once. param_dates = list(gen_calendars( @@ -237,15 +248,12 @@ class EventLoaderCommonTest(object): def zip_with_floats(self, dates, flts): return pd.Series(flts, index=dates).astype('float') - def num_days_between(self, dates, start_date, end_date): return num_days_in_range(dates, start_date, end_date) - def zip_with_dates(self, index_dates, dts): return pd.Series(pd.to_datetime(dts), index=index_dates) - def loader_args(self, dates): """Construct the base object to pass to the loader. 
@@ -268,7 +276,59 @@ class EventLoaderCommonTest(object): loader = self.loader_type(*self.loader_args(dates)) return SimplePipelineEngine(lambda _: loader, dates, self.finder) - def get_expected_previous(self, dates): + def get_expected_next_event_dates(self, dates): + num_days_between_for_dates = partial(self.num_days_between, dates) + zip_with_dates_for_dates = partial(self.zip_with_dates, dates) + return pd.DataFrame({ + 0: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-15'] * + num_days_between_for_dates('2014-01-05', '2014-01-15') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-16', '2014-01-20') + + ['NaT'] * + num_days_between_for_dates('2014-01-21', None) + ), + 1: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-05', '2014-01-09') + + ['2014-01-15'] * + num_days_between_for_dates('2014-01-10', '2014-01-15') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-16', '2014-01-20') + + ['NaT'] * + num_days_between_for_dates('2014-01-21', None) + ), + 2: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-10'] * + num_days_between_for_dates('2014-01-05', '2014-01-10') + + ['NaT'] * + num_days_between_for_dates('2014-01-11', '2014-01-14') + + ['2014-01-20'] * + num_days_between_for_dates('2014-01-15', '2014-01-20') + + ['NaT'] * + num_days_between_for_dates('2014-01-21', None) + ), + 3: zip_with_dates_for_dates( + ['NaT'] * + num_days_between_for_dates(None, '2014-01-04') + + ['2014-01-10'] * + num_days_between_for_dates('2014-01-05', '2014-01-10') + + ['2014-01-15'] * + num_days_between_for_dates('2014-01-11', '2014-01-15') + + ['NaT'] * + num_days_between_for_dates('2014-01-16', None) + ), + 4: zip_with_dates_for_dates(['NaT'] * + len(dates)), + }, index=dates) + + def get_expected_previous_event_dates(self, dates): num_days_between_for_dates = 
partial(self.num_days_between, dates) zip_with_dates_for_dates = partial(self.zip_with_dates, dates) return pd.DataFrame({ @@ -339,7 +399,7 @@ class EventLoaderCommonTest(object): index=announcement_dates.index, ) - def _test_compute_buyback_auth(self, dates): + def _test_compute(self, dates): engine = self.setup_engine(dates) self.setup(dates) @@ -358,7 +418,3 @@ class EventLoaderCommonTest(object): assert_series_equal(result[col_name].xs(sid, level=1), self.cols[col_name][sid], check_names=False) - - - - From 18d838be87ae68406db6f79f57c66f4921ef8502 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Thu, 25 Feb 2016 13:52:16 -0500 Subject: [PATCH 12/12] MAINT: modify coersion logic. MAINT: make expected_cols class attributes. MAINT: make concrete_loader a class attribute. MAINT: drop sid column before creating loader. TST: add method to fill df with event cases. TST: move common case df into test_events.py. TST: improve tests and fix error message assertions. STY: fix whitespace. DOC: update docs. STY: fix style. MAINT: clean up STY: fix indentation. MAINT: use different assertion method to check error message for python3.4 compatibility. 
--- tests/pipeline/test_buyback_auth.py | 63 +++---- tests/pipeline/test_earnings.py | 45 +---- tests/pipeline/test_events.py | 156 +++++++++++++----- .../pipeline/loaders/blaze/buyback_auth.py | 24 +-- zipline/pipeline/loaders/blaze/earnings.py | 6 +- zipline/pipeline/loaders/blaze/events.py | 3 +- zipline/pipeline/loaders/buyback_auth.py | 13 +- zipline/pipeline/loaders/earnings.py | 6 +- zipline/pipeline/loaders/events.py | 83 ++++++---- 9 files changed, 212 insertions(+), 187 deletions(-) diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 6d4ee10b..fbdd9797 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -7,10 +7,9 @@ from unittest import TestCase import blaze as bz from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack -from nose_parameterized import parameterized import pandas as pd from six import iteritems -from tests.pipeline.test_events import param_dates, EventLoaderCommonTest +from tests.pipeline.test_events import EventLoaderCommonMixin, DATE_FIELD_NAME from zipline.pipeline.common import( BUYBACK_ANNOUNCEMENT_FIELD_NAME, @@ -42,54 +41,53 @@ from zipline.utils.test_utils import ( buyback_authorizations = [ # K1--K2--A1--A2. pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', - '2014-01-20']), SHARE_COUNT_FIELD_NAME: [1, 15], CASH_FIELD_NAME: [10, 20] }), # K1--K2--A2--A1. pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ - '2014-01-20', '2014-01-15' - ]), - SHARE_COUNT_FIELD_NAME: [7, 13], CASH_FIELD_NAME: [10, 22] + SHARE_COUNT_FIELD_NAME: [7, 13], + CASH_FIELD_NAME: [10, 22] }), # K1--A1--K2--A2. 
pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ - '2014-01-10', '2014-01-20' - ]), SHARE_COUNT_FIELD_NAME: [3, 1], CASH_FIELD_NAME: [4, 7] }), # K1 == K2. pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), - BUYBACK_ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([ - '2014-01-10', '2014-01-15' - ]), SHARE_COUNT_FIELD_NAME: [6, 23], CASH_FIELD_NAME: [1, 2] }), pd.DataFrame( - columns=[TS_FIELD_NAME, - BUYBACK_ANNOUNCEMENT_FIELD_NAME, - SHARE_COUNT_FIELD_NAME, + columns=[SHARE_COUNT_FIELD_NAME, CASH_FIELD_NAME], dtype='datetime64[ns]' ), ] -class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonTest): +def create_buyback_auth_tst_frame(cases, field_to_drop): + buyback_auth_df = { + sid: + pd.concat([df, buyback_authorizations[sid]], axis=1).drop( + field_to_drop, 1) + for sid, df + in enumerate(case.rename(columns={DATE_FIELD_NAME: + BUYBACK_ANNOUNCEMENT_FIELD_NAME} + ) + for case in cases + ) + } + return buyback_auth_df + + +class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonMixin): """ Test for cash buyback authorizations dataset. 
""" pipeline_columns = { - (PREVIOUS_BUYBACK_CASH): + PREVIOUS_BUYBACK_CASH: CashBuybackAuthorizations.previous_value.latest, PREVIOUS_BUYBACK_ANNOUNCEMENT: CashBuybackAuthorizations.previous_announcement_date.latest, @@ -104,8 +102,8 @@ class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonTest): tmp_asset_finder(equities=cls.equity_info), ) cls.cols = {} - cls.dataset = {sid: df.drop(SHARE_COUNT_FIELD_NAME, 1) - for sid, df in enumerate(buyback_authorizations)} + cls.dataset = create_buyback_auth_tst_frame(cls.event_dates_cases, + SHARE_COUNT_FIELD_NAME) cls.loader_type = CashBuybackAuthorizationsLoader @classmethod @@ -145,12 +143,8 @@ class CashBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonTest): self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] ) - @parameterized.expand(param_dates) - def test_compute_cash_buyback_auth(self, dates): - self._test_compute(dates) - -class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): +class ShareBuybackAuthLoaderTestCase(TestCase, EventLoaderCommonMixin): """ Test for share buyback authorizations dataset. """ @@ -170,9 +164,8 @@ class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): tmp_asset_finder(equities=cls.equity_info), ) cls.cols = {} - cls.dataset = {sid: df.drop(CASH_FIELD_NAME, 1) - for sid, df in - enumerate(buyback_authorizations)} + cls.dataset = create_buyback_auth_tst_frame(cls.event_dates_cases, + CASH_FIELD_NAME) cls.loader_type = ShareBuybackAuthorizationsLoader @classmethod @@ -214,10 +207,6 @@ class ShareBuybackAuthLoaderTestCase(EventLoaderCommonTest, TestCase): self.cols[PREVIOUS_BUYBACK_ANNOUNCEMENT] ) - @parameterized.expand(param_dates) - def test_compute_share_buyback_auth(self, dates): - self._test_compute(dates) - class BlazeCashBuybackAuthLoaderTestCase(CashBuybackAuthLoaderTestCase): """ Test case for loading via blaze. 
diff --git a/tests/pipeline/test_earnings.py b/tests/pipeline/test_earnings.py index a4bce18b..ce39220d 100644 --- a/tests/pipeline/test_earnings.py +++ b/tests/pipeline/test_earnings.py @@ -6,10 +6,9 @@ from unittest import TestCase import blaze as bz from blaze.compute.core import swap_resources_into_scope from contextlib2 import ExitStack -from nose_parameterized import parameterized import pandas as pd from six import iteritems -from tests.pipeline.test_events import EventLoaderCommonTest, param_dates +from tests.pipeline.test_events import EventLoaderCommonMixin, DATE_FIELD_NAME from zipline.pipeline.common import ( ANNOUNCEMENT_FIELD_NAME, @@ -34,39 +33,8 @@ from zipline.utils.test_utils import ( tmp_asset_finder, ) -earnings_dates = [ - # K1--K2--E1--E2. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-15', - '2014-01-20']) - }), - # K1--K2--E2--E1. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-20', - '2014-01-15']) - }), - # K1--E1--K2--E2. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', - '2014-01-20']) - }), - # K1 == K2. - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime(['2014-01-10', - '2014-01-15']) - }), - pd.DataFrame({ - TS_FIELD_NAME: pd.to_datetime([]), - ANNOUNCEMENT_FIELD_NAME: pd.to_datetime([]) - }) -] - -class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonTest): +class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonMixin): """ Tests for loading the earnings announcement data. 
""" @@ -86,7 +54,10 @@ class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonTest): end_date=pd.Timestamp('2015-01-01', tz='UTC'), ) cls.cols = {} - cls.dataset = {sid: df for sid, df in enumerate(earnings_dates)} + cls.dataset = {sid: df for sid, df in enumerate( + case.rename( + columns={DATE_FIELD_NAME: ANNOUNCEMENT_FIELD_NAME} + ) for case in cls.event_dates_cases)} cls.finder = stack.enter_context( tmp_asset_finder(equities=equity_info), ) @@ -115,10 +86,6 @@ class EarningsCalendarLoaderTestCase(TestCase, EventLoaderCommonTest): self.cols[DAYS_TO_NEXT] = _expected_next_busday_offsets self.cols[DAYS_SINCE_PREV] = _expected_previous_busday_offsets - @parameterized.expand(param_dates) - def test_compute_earnings(self, dates): - self._test_compute(dates) - class BlazeEarningsCalendarLoaderTestCase(EarningsCalendarLoaderTestCase): @classmethod diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 6c94b902..48cd4d07 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -3,11 +3,13 @@ Tests for setting up an EventsLoader and a BlazeEventsLoader. 
""" from functools import partial from nose_parameterized import parameterized +import re +from unittest import TestCase import blaze as bz import numpy as np import pandas as pd -from pandas.util.testing import assert_series_equal, TestCase +from pandas.util.testing import assert_series_equal from zipline.pipeline import SimplePipelineEngine, Pipeline from zipline.pipeline.common import ( @@ -18,21 +20,29 @@ from zipline.pipeline.common import ( from zipline.pipeline.data import DataSet, Column from zipline.pipeline.loaders.blaze.events import BlazeEventsLoader from zipline.pipeline.loaders.events import ( - BAD_DATA_FORMAT_ERROR, DF_NO_TS_NOT_INFER_TS_ERROR, DTINDEX_NOT_INFER_TS_ERROR, EventsLoader, SERIES_NO_DTINDEX_ERROR, WRONG_COLS_ERROR, + WRONG_MANY_COL_DATA_FORMAT_ERROR, + WRONG_SINGLE_COL_DATA_FORMAT_ERROR ) from zipline.utils.memoize import lazyval -from zipline.utils.numpy_utils import (datetime64ns_dtype, - NaTD, - make_datetime64D) -from zipline.utils.test_utils import gen_calendars, num_days_in_range, \ +from zipline.utils.numpy_utils import ( + datetime64ns_dtype, + NaTD, + make_datetime64D +) +from zipline.utils.test_utils import ( + gen_calendars, + num_days_in_range, make_simple_equity_info +) -ABSTRACT_METHODS_ERROR = 'abstract methods concrete_loader' +ABSTRACT_CONCRETE_LOADER_ERROR = 'abstract methods concrete_loader' +ABSTRACT_EXPECTED_COLS_ERROR = 'abstract methods expected_cols' +DATE_FIELD_NAME = "event_date" class EventDataSet(DataSet): @@ -40,6 +50,7 @@ class EventDataSet(DataSet): class EventDataSetLoader(EventsLoader): + expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME]) def __init__(self, all_dates, @@ -53,10 +64,6 @@ class EventDataSetLoader(EventsLoader): dataset=dataset, ) - @property - def expected_cols(self): - return frozenset([ANNOUNCEMENT_FIELD_NAME]) - @lazyval def previous_announcement_loader(self): return self._previous_event_date_loader( @@ -72,6 +79,12 @@ class EventDataSetLoader(EventsLoader): ) +# Test case just for 
catching an error when multiple columns are in the wrong +# data format, so no loader defined. +class EventDataSetLoaderMultipleExpectedCols(EventsLoader): + expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME, "other_field"]) + + class EventDataSetLoaderNoExpectedCols(EventsLoader): def __init__(self, @@ -90,32 +103,32 @@ class EventDataSetLoaderNoExpectedCols(EventsLoader): dtx = pd.date_range('2014-01-01', '2014-01-10') -def assert_loader_error(events_by_sid, error, msg, infer_timestamps): - with TestCase.assertRaises(error) as context: - EventDataSetLoader( - dtx, events_by_sid, infer_timestamps=infer_timestamps, - ) - TestCase.assertTrue(msg in context.exception) - - class EventLoaderTestCase(TestCase): + def assert_loader_error(self, events_by_sid, error, msg, + infer_timestamps, loader): + with self.assertRaisesRegexp(error, re.escape(msg)): + loader( + dtx, events_by_sid, infer_timestamps=infer_timestamps, + ) def test_no_expected_cols_defined(self): events_by_sid = {0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx})} - assert_loader_error(events_by_sid, TypeError, ABSTRACT_METHODS_ERROR, - True) + self.assert_loader_error(events_by_sid, TypeError, + ABSTRACT_EXPECTED_COLS_ERROR, + True, EventDataSetLoaderNoExpectedCols) def test_wrong_cols(self): wrong_col_name = 'some_other_col' # Test wrong cols (cols != expected) events_by_sid = {0: pd.DataFrame({wrong_col_name: dtx})} - assert_loader_error( + self.assert_loader_error( events_by_sid, ValueError, WRONG_COLS_ERROR.format( - expected_columns=EventDataSetLoader.expected_cols, + expected_columns=list(EventDataSetLoader.expected_cols), sid=0, - resulting_columns=wrong_col_name, + resulting_columns=[wrong_col_name], ), - True + True, + EventDataSetLoader ) @parameterized.expand([ @@ -125,9 +138,9 @@ class EventLoaderTestCase(TestCase): [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx, TS_FIELD_NAME: dtx}), False], # DatetimeIndex with infer_timestamps = True - [pd.DatetimeIndex(dtx, name=ANNOUNCEMENT_FIELD_NAME), 
True], + [pd.DatetimeIndex(dtx), True], # Series with DatetimeIndex as index and infer_timestamps = False - [pd.Series(dtx, index=dtx, name=ANNOUNCEMENT_FIELD_NAME), False] + [pd.Series(dtx, index=dtx), False] ]) def test_conversion_to_df(self, df, infer_timestamps): @@ -164,37 +177,61 @@ class EventLoaderTestCase(TestCase): DF_NO_TS_NOT_INFER_TS_ERROR.format( timestamp_column_name=TS_FIELD_NAME, sid=0 - ) + ), + EventDataSetLoader ], # DatetimeIndex with infer_timestamps = False [ pd.DatetimeIndex(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, - DTINDEX_NOT_INFER_TS_ERROR.format(sid=0) + DTINDEX_NOT_INFER_TS_ERROR.format(sid=0), + EventDataSetLoader ], # Series with DatetimeIndex as index and infer_timestamps = False [ pd.Series(dtx, name=ANNOUNCEMENT_FIELD_NAME), False, - SERIES_NO_DTINDEX_ERROR.format(sid=0) + SERIES_NO_DTINDEX_ERROR.format(sid=0), + EventDataSetLoader ], - # Some other data structure that is not expected + # Below, 2 cases repeated for infer_timestamps = True and False. + # Shouldn't make a difference in the outcome. 
+ # We expected 1 column but got a data structure other than a + # DataFrame, Series, or DatetimeIndex [ - dtx, - False, - BAD_DATA_FORMAT_ERROR.format(sid=0) - ], - [ - dtx, + [dtx], True, - BAD_DATA_FORMAT_ERROR.format(sid=0) + WRONG_SINGLE_COL_DATA_FORMAT_ERROR.format(sid=0), + EventDataSetLoader + ], + # We expected multiple columns but got a data structure other + # than a DataFrame + [ + [dtx, dtx], + True, + WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=0), + EventDataSetLoaderMultipleExpectedCols + ], + [ + [dtx], + False, + WRONG_SINGLE_COL_DATA_FORMAT_ERROR.format(sid=0), + EventDataSetLoader + ], + # We expected multiple columns but got a data structure other + # than a DataFrame + [ + [dtx, dtx], + False, + WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=0), + EventDataSetLoaderMultipleExpectedCols ] ] ) - def test_bad_conversion_to_df(self, df, infer_timestamps, msg): + def test_bad_conversion_to_df(self, df, infer_timestamps, msg, loader): events_by_sid = {0: df} - assert_loader_error(events_by_sid, ValueError, msg, - infer_timestamps=infer_timestamps) + self.assert_loader_error(events_by_sid, ValueError, msg, + infer_timestamps, loader) class BlazeEventDataSetLoaderNoConcreteLoader(BlazeEventsLoader): @@ -212,7 +249,9 @@ class BlazeEventDataSetLoaderNoConcreteLoader(BlazeEventsLoader): class BlazeEventLoaderTestCase(TestCase): # Blaze loader: need to test failure if no concrete loader def test_no_concrete_loader_defined(self): - with TestCase.assertRaises(TypeError) as context: + with self.assertRaisesRegexp( + TypeError, re.escape(ABSTRACT_CONCRETE_LOADER_ERROR) + ): BlazeEventDataSetLoaderNoConcreteLoader( bz.Data( pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx, @@ -220,7 +259,6 @@ class BlazeEventLoaderTestCase(TestCase): }) ) ) - TestCase.assertTrue(ABSTRACT_METHODS_ERROR in context.exception) # Must be a list - can't use generator since this needs to be used more than @@ -237,7 +275,7 @@ param_dates = list(gen_calendars( )) -class 
EventLoaderCommonTest(object): +class EventLoaderCommonMixin(object): sids = A, B, C, D, E = range(5) equity_info = make_simple_equity_info( sids, @@ -245,6 +283,33 @@ class EventLoaderCommonTest(object): end_date=pd.Timestamp('2015-01-01', tz='UTC'), ) + event_dates_cases = [ + # K1--K2--E1--E2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + DATE_FIELD_NAME: pd.to_datetime(['2014-01-15', '2014-01-20']) + }), + # K1--K2--E2--E1. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-10']), + DATE_FIELD_NAME: pd.to_datetime(['2014-01-20', '2014-01-15']) + }), + # K1--E1--K2--E2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05', '2014-01-15']), + DATE_FIELD_NAME: pd.to_datetime(['2014-01-10', '2014-01-20']) + }), + # K1 == K2. + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime(['2014-01-05'] * 2), + DATE_FIELD_NAME: pd.to_datetime(['2014-01-10', '2014-01-15']) + }), + pd.DataFrame({ + TS_FIELD_NAME: pd.to_datetime([]), + DATE_FIELD_NAME: pd.to_datetime([]) + }) + ] + def zip_with_floats(self, dates, flts): return pd.Series(flts, index=dates).astype('float') @@ -399,7 +464,8 @@ class EventLoaderCommonTest(object): index=announcement_dates.index, ) - def _test_compute(self, dates): + @parameterized.expand(param_dates) + def test_compute(self, dates): engine = self.setup_engine(dates) self.setup(dates) diff --git a/zipline/pipeline/loaders/blaze/buyback_auth.py b/zipline/pipeline/loaders/blaze/buyback_auth.py index 52305e30..cd83e1d1 100644 --- a/zipline/pipeline/loaders/blaze/buyback_auth.py +++ b/zipline/pipeline/loaders/blaze/buyback_auth.py @@ -17,8 +17,8 @@ from .events import BlazeEventsLoader class BlazeCashBuybackAuthorizationsLoader(BlazeEventsLoader): - """A pipeline loader for the ``CashBuybackAuthorizations`` dataset that loads - data from a blaze expression. + """A pipeline loader for the ``CashBuybackAuthorizations`` dataset that + loads data from a blaze expression. 
Parameters ---------- @@ -34,8 +34,6 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsLoader): The timezeone to use for the data query cutoff. dataset: DataSet The DataSet object for which this loader loads data. - concrete_loader: EventsLoader - The reference loader to use for this dataset. Notes ----- @@ -69,6 +67,8 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsLoader): CASH_FIELD_NAME }) + concrete_loader = CashBuybackAuthorizationsLoader + def __init__(self, expr, resources=None, @@ -87,14 +87,10 @@ class BlazeCashBuybackAuthorizationsLoader(BlazeEventsLoader): dataset=dataset, **kwargs) - @property - def concrete_loader(self): - return CashBuybackAuthorizationsLoader - class BlazeShareBuybackAuthorizationsLoader(BlazeEventsLoader): - """A pipeline loader for the ``ShareBuybackAuthorizations`` dataset that loads - data from a blaze expression. + """A pipeline loader for the ``ShareBuybackAuthorizations`` dataset that + loads data from a blaze expression. Parameters ---------- @@ -110,8 +106,6 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsLoader): The timezeone to use for the data query cutoff. dataset: DataSet The DataSet object for which this loader loads data. - concrete_loader: EventsLoader - The reference loader to use for this dataset. 
Notes ----- @@ -145,6 +139,8 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsLoader): SHARE_COUNT_FIELD_NAME, }) + concrete_loader = ShareBuybackAuthorizationsLoader + def __init__(self, expr, resources=None, @@ -162,7 +158,3 @@ class BlazeShareBuybackAuthorizationsLoader(BlazeEventsLoader): data_query_tz=data_query_tz, dataset=dataset, **kwargs) - - @property - def concrete_loader(self): - return ShareBuybackAuthorizationsLoader diff --git a/zipline/pipeline/loaders/blaze/earnings.py b/zipline/pipeline/loaders/blaze/earnings.py index 80b963d0..39724e76 100644 --- a/zipline/pipeline/loaders/blaze/earnings.py +++ b/zipline/pipeline/loaders/blaze/earnings.py @@ -57,6 +57,8 @@ class BlazeEarningsCalendarLoader(BlazeEventsLoader): ANNOUNCEMENT_FIELD_NAME, }) + concrete_loader = EarningsCalendarLoader + def __init__(self, expr, resources=None, @@ -71,7 +73,3 @@ class BlazeEarningsCalendarLoader(BlazeEventsLoader): resources=resources, odo_kwargs=odo_kwargs, data_query_time=data_query_time, data_query_tz=data_query_tz, **kwargs) - - @property - def concrete_loader(self): - return EarningsCalendarLoader diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 3d334ce6..93ed6b72 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -124,4 +124,5 @@ class BlazeEventsLoader(PipelineLoader): ).load_adjusted_array(columns, dates, assets, mask) def prepare_data(self, raw, gb): - return {sid: raw.loc[group] for sid, group in gb.groups.items()} + return {sid: raw.loc[group].drop(SID_FIELD_NAME, axis=1) for sid, group + in gb.groups.items()} diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index 677732ec..d46a207a 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -24,6 +24,8 @@ class CashBuybackAuthorizationsLoader(EventsLoader): event date, cash value)] """ + expected_cols = 
frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, + CASH_FIELD_NAME]) def __init__(self, all_dates, @@ -37,10 +39,6 @@ class CashBuybackAuthorizationsLoader(EventsLoader): dataset=dataset, ) - @property - def expected_cols(self): - return frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, CASH_FIELD_NAME]) - @lazyval def previous_value_loader(self): return self._previous_event_value_loader( @@ -69,6 +67,8 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): event date, share value)] """ + expected_cols = frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, + SHARE_COUNT_FIELD_NAME]) def __init__(self, all_dates, @@ -82,11 +82,6 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): dataset=dataset, ) - @property - def expected_cols(self): - return frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, - SHARE_COUNT_FIELD_NAME]) - @lazyval def previous_share_count_loader(self): return self._previous_event_value_loader( diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index f3225790..89d376c9 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -10,6 +10,8 @@ from zipline.utils.memoize import lazyval class EarningsCalendarLoader(EventsLoader): + expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME]) + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, dataset=EarningsCalendar): @@ -17,10 +19,6 @@ class EarningsCalendarLoader(EventsLoader): all_dates, events_by_sid, infer_timestamps, dataset=dataset, ) - @property - def expected_cols(self): - return frozenset([ANNOUNCEMENT_FIELD_NAME]) - @lazyval def next_announcement_loader(self): return self._next_event_date_loader(self.dataset.next_announcement, diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index dd20b7c5..84d0735c 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -6,13 +6,18 @@ from toolz import merge from .base import PipelineLoader from .frame import 
DataFrameLoader from .utils import next_date_frame, previous_date_frame, previous_value -from zipline.pipeline.common import SID_FIELD_NAME, TS_FIELD_NAME +from zipline.pipeline.common import TS_FIELD_NAME WRONG_COLS_ERROR = "Expected columns {expected_columns} for sid {sid} but " \ "got columns {resulting_columns}." -BAD_DATA_FORMAT_ERROR = ("Data for sid {sid} must be in DataFrame, " - "Series, or DatetimeIndex.") +WRONG_SINGLE_COL_DATA_FORMAT_ERROR = ("Data for sid {sid} is expected to have " + "1 column and to be in a DataFrame, " + "Series, or DatetimeIndex.") + +WRONG_MANY_COL_DATA_FORMAT_ERROR = ("Data for sid {sid} is expected to have " + "more than 1 column and to be in a " + "DataFrame.") SERIES_NO_DTINDEX_ERROR = ("Got Series for sid {sid}, but index was not " "DatetimeIndex.") @@ -85,23 +90,8 @@ class EventsLoader(PipelineLoader): dates = self.all_dates.values for k, v in iteritems(events_by_sid): - # First, must convert to DataFrame. - if isinstance(v, pd.Series): - if not isinstance(v.index, pd.DatetimeIndex): - raise ValueError( - SERIES_NO_DTINDEX_ERROR.format(sid=k) - ) - self.events_by_sid[k] = v = pd.DataFrame(v) - elif isinstance(v, pd.DatetimeIndex): - if not infer_timestamps: - raise ValueError( - DTINDEX_NOT_INFER_TS_ERROR.format(sid=k) - ) - self.events_by_sid[k] = v = pd.DataFrame( - v, index=[dates[0]] * len(v) - ) # Already a DataFrame - elif isinstance(v, pd.DataFrame): + if isinstance(v, pd.DataFrame): if TS_FIELD_NAME not in v.columns: if not infer_timestamps: raise ValueError( @@ -114,21 +104,50 @@ class EventsLoader(PipelineLoader): v.index = [dates[0]] * len(v) else: self.events_by_sid[k] = v.set_index(TS_FIELD_NAME) - else: - raise ValueError(BAD_DATA_FORMAT_ERROR.format(sid=k)) - # Once data is in a DF, make sure columns are correct. - cols_except_ts = (set(v.columns) - - {TS_FIELD_NAME} - - {SID_FIELD_NAME}) - # Check that all columns other than timestamp are as expected. 
- if cols_except_ts != self.expected_cols: - raise ValueError( - WRONG_COLS_ERROR .format( - expected_columns=self.expected_cols, - sid=k, - resulting_columns=v.columns.values + # Once data is in a DF, make sure columns are correct. + cols_except_ts = (set(v.columns) - + {TS_FIELD_NAME}) + + # Check that all columns other than timestamp are as expected. + if cols_except_ts != self.expected_cols: + raise ValueError( + WRONG_COLS_ERROR.format( + expected_columns=list(self.expected_cols), + sid=k, + resulting_columns=v.columns.values + ) ) + # Not a DataFrame and we only expect 1 column + elif len(self.expected_cols) == 1: + # First, must convert to DataFrame. + if isinstance(v, pd.Series): + if not isinstance(v.index, pd.DatetimeIndex): + raise ValueError( + SERIES_NO_DTINDEX_ERROR.format(sid=k) + ) + self.events_by_sid[k] = pd.DataFrame({ + list(self.expected_cols)[0]: v}) + elif isinstance(v, pd.DatetimeIndex): + if not infer_timestamps: + raise ValueError( + DTINDEX_NOT_INFER_TS_ERROR.format(sid=k) + ) + self.events_by_sid[k] = pd.DataFrame({ + list(self.expected_cols)[0]: v + }, index=[dates[0]] * len(v)) + else: + # We expect 1 column, but we got something other than a + # Series, DatetimeIndex, or DataFrame. + raise ValueError( + WRONG_SINGLE_COL_DATA_FORMAT_ERROR.format(sid=k) + ) + else: + # We expected multiple columns, but we got something other + # than a DataFrame. + raise ValueError( + WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=k) ) + self.dataset = dataset def get_loader(self, column):