From 89412616a6da6570f36eeaf9365dddf5575c8204 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 19 Apr 2016 16:02:13 -0400 Subject: [PATCH 1/6] MAINT: filter rows with nulls in expected columns --- zipline/pipeline/loaders/events.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index e88d7966..074ee680 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -148,7 +148,8 @@ class EventsLoader(PipelineLoader): raise ValueError( WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=k) ) - + self.events_by_sid = {sid: df.dropna(subset=self.expected_cols) for + sid, df in self.events_by_sid.iteritems()} self.dataset = dataset def get_loader(self, column): From f8aa7c2ef4dd9a1992d36ebd7dac25238cebcbbb Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 19 Apr 2016 16:54:11 -0400 Subject: [PATCH 2/6] TST: add test for case when null in expected column --- tests/pipeline/test_events.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 1bf7e46e..107f40aa 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -7,7 +7,7 @@ from unittest import TestCase import blaze as bz from nose_parameterized import parameterized import pandas as pd -from pandas.util.testing import assert_series_equal +from pandas.util.testing import assert_series_equal, assert_frame_equal from zipline.pipeline.common import ( ANNOUNCEMENT_FIELD_NAME, @@ -119,6 +119,24 @@ class EventLoaderTestCase(TestCase): EventDataSetLoader ) + def test_null_in_expected_cols(self): + dates_with_null = pd.Series(dtx) + dates_with_null[2] = pd.NaT + events_by_sid = {0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: + dates_with_null, + TS_FIELD_NAME: dtx})} + loader = EventDataSetLoader( + dtx, + events_by_sid, + ) + + expected = events_by_sid[0].drop(2, axis=0).set_index(TS_FIELD_NAME) + # Check that index by first given date has been added + assert_frame_equal( + loader.events_by_sid[0], + expected, + ) + @parameterized.expand([ # DataFrame without timestamp column and infer_timestamps = True [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), True], From e41c99d07746a188c27ae0e04806ba638165d623 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Wed, 20 Apr 2016 11:38:41 -0400 Subject: [PATCH 3/6] MAINT: add an event date col field to each loader MAINT: add event date col field and filter rows where this field is null TST: modify tests to filter nulls in event date col MAINT: calculate value repeats by vectorized computation on separate start and end dates. MAINT: pass DatetimeIndex instead of list of strings --- tests/pipeline/test_buyback_auth.py | 3 +- tests/pipeline/test_dividends.py | 46 ++++++++--------- tests/pipeline/test_events.py | 51 +++++++++++++++---- zipline/pipeline/loaders/buyback_auth.py | 8 +-- .../pipeline/loaders/consensus_estimates.py | 18 +------ zipline/pipeline/loaders/dividends.py | 20 +++----- zipline/pipeline/loaders/earnings.py | 6 +-- zipline/pipeline/loaders/events.py | 25 ++++----- zipline/pipeline/loaders/utils.py | 43 +++++++++------- zipline/testing/fixtures.py | 37 +++++++------- 10 files changed, 138 insertions(+), 119 deletions(-) diff --git a/tests/pipeline/test_buyback_auth.py b/tests/pipeline/test_buyback_auth.py index 9bf8e19c..e88e664c 100644 --- a/tests/pipeline/test_buyback_auth.py +++ b/tests/pipeline/test_buyback_auth.py @@ -42,7 +42,8 @@ from zipline.testing.fixtures import ( ) date_intervals = [ - [[None, '2014-01-04'], ['2014-01-05', '2014-01-09'], ['2014-01-10', None]] + [['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-09'], + ['2014-01-10', '2014-01-31']] ] buyback_authorizations_cases = [ diff --git a/tests/pipeline/test_dividends.py b/tests/pipeline/test_dividends.py index 5c10b7bc..418517f3 100644 --- a/tests/pipeline/test_dividends.py +++ b/tests/pipeline/test_dividends.py @@ -98,41 +98,41 @@ dividends_cases = [ prev_date_intervals = [ [ - [None, '2014-01-14'], ['2014-01-15', '2014-01-19'], - ['2014-01-20', None] + ['2014-01-01', '2014-01-14'], ['2014-01-15', '2014-01-19'], + ['2014-01-20', '2014-01-31'] ], [ - [None, '2014-01-14'], ['2014-01-15', '2014-01-19'], - ['2014-01-20', None] + ['2014-01-01', '2014-01-14'], ['2014-01-15', '2014-01-19'], + ['2014-01-20', '2014-01-31'] ], [ - [None, '2014-01-09'], ['2014-01-10', '2014-01-19'], - ['2014-01-20', None] + ['2014-01-01', '2014-01-09'], ['2014-01-10', '2014-01-19'], + ['2014-01-20', '2014-01-31'] ], [ - [None, '2014-01-09'], ['2014-01-10', '2014-01-14'], - ['2014-01-15', None] + ['2014-01-01', '2014-01-09'], ['2014-01-10', '2014-01-14'], + ['2014-01-15', '2014-01-31'] ] ] next_date_intervals = [ [ - [None, '2014-01-04'], ['2014-01-05', '2014-01-15'], - ['2014-01-16', '2014-01-20'], ['2014-01-21', None] + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-15'], + ['2014-01-16', '2014-01-20'], ['2014-01-21', '2014-01-31'] ], [ - [None, '2014-01-04'], ['2014-01-05', '2014-01-09'], + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-09'], ['2014-01-10', '2014-01-15'], ['2014-01-16', '2014-01-20'], - ['2014-01-21', None] + ['2014-01-21', '2014-01-31'] ], [ - [None, '2014-01-04'], ['2014-01-05', '2014-01-10'], + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-10'], ['2014-01-11', '2014-01-14'], ['2014-01-15', '2014-01-20'], - ['2014-01-21', None] + ['2014-01-21', '2014-01-31'] ], [ - [None, '2014-01-04'], ['2014-01-05', '2014-01-10'], - ['2014-01-11', '2014-01-15'], ['2014-01-16', None] + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-10'], + ['2014-01-11', '2014-01-15'], ['2014-01-16', '2014-01-31'] ] ] @@ -184,19 +184,19 @@ class DividendsByAnnouncementDateTestCase(WithPipelineEventDataLoader, def setup(self, dates): date_intervals = [ [ - [None, '2014-01-04'], ['2014-01-05', '2014-01-09'], - ['2014-01-10', None] + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-09'], + ['2014-01-10', '2014-01-31'] ], [ - [None, '2014-01-04'], ['2014-01-05', '2014-01-09'], - ['2014-01-10', None] + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-09'], + ['2014-01-10', '2014-01-31'] ], [ - [None, '2014-01-04'], ['2014-01-05', '2014-01-14'], - ['2014-01-15', None] + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-14'], + ['2014-01-15', '2014-01-31'] ], [ - [None, '2014-01-04'], ['2014-01-05', None] + ['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-31'] ] ] announcement_dates = [['NaT', '2014-01-04', '2014-01-09'], diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 107f40aa..50ba8a2f 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -26,20 +26,28 @@ from zipline.pipeline.loaders.events import ( WRONG_SINGLE_COL_DATA_FORMAT_ERROR ) from zipline.utils.memoize import lazyval -from zipline.utils.numpy_utils import datetime64ns_dtype +from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype +OTHER_FIELD = "other_field" ABSTRACT_CONCRETE_LOADER_ERROR = 'abstract methods concrete_loader' -ABSTRACT_EXPECTED_COLS_ERROR = 'abstract methods expected_cols' +ABSTRACT_EXPECTED_COLS_ERROR = 'abstract methods event_date_col, expected_cols' class EventDataSet(DataSet): previous_announcement = Column(datetime64ns_dtype) +class OtherFieldEventDataSet(DataSet): + previous_announcement = Column(datetime64ns_dtype) + previous_other_field = Column(float64_dtype) + + class EventDataSetLoader(EventsLoader): expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME]) + event_date_col = ANNOUNCEMENT_FIELD_NAME + def __init__(self, all_dates, events_by_sid, @@ -56,21 +64,38 @@ class EventDataSetLoader(EventsLoader): def previous_announcement_loader(self): return self._previous_event_date_loader( self.dataset.previous_announcement, - ANNOUNCEMENT_FIELD_NAME, + ) + + +class EventDataSetLoaderMultipleExpectedCols(EventDataSetLoader): + expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME, OTHER_FIELD]) + event_date_col = ANNOUNCEMENT_FIELD_NAME + + def __init__(self, + all_dates, + events_by_sid, + infer_timestamps=False, + dataset=OtherFieldEventDataSet): + super(EventDataSetLoader, self).__init__( + all_dates, + events_by_sid, + infer_timestamps=infer_timestamps, + dataset=dataset, ) @lazyval - def next_announcement_loader(self): + def previous_other_field_loader(self): return self._previous_event_date_loader( self.dataset.previous_announcement, - ANNOUNCEMENT_FIELD_NAME, ) # Test case just for catching an error when multiple columns are in the wrong # data format, so no loader defined. -class EventDataSetLoaderMultipleExpectedCols(EventsLoader): - expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME, "other_field"]) +class EventDataSetLoaderMultipleExpectedColsNoColumnLoaders(EventsLoader): + expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME, OTHER_FIELD]) + + event_date_col = ANNOUNCEMENT_FIELD_NAME class EventDataSetLoaderNoExpectedCols(EventsLoader): @@ -119,13 +144,17 @@ class EventLoaderTestCase(TestCase): EventDataSetLoader ) - def test_null_in_expected_cols(self): + def test_null_in_event_date_col(self): + # Tests that getting a null date in the event date column filters the + # entire row from the data. dates_with_null = pd.Series(dtx) dates_with_null[2] = pd.NaT + other_col_data = pd.Series(range(0, len(dtx))) events_by_sid = {0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dates_with_null, + OTHER_FIELD: other_col_data, TS_FIELD_NAME: dtx})} - loader = EventDataSetLoader( + loader = EventDataSetLoaderMultipleExpectedCols( dtx, events_by_sid, ) @@ -216,7 +245,7 @@ class EventLoaderTestCase(TestCase): [dtx, dtx], True, WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=0), - EventDataSetLoaderMultipleExpectedCols + EventDataSetLoaderMultipleExpectedColsNoColumnLoaders ], [ [dtx], @@ -230,7 +259,7 @@ class EventLoaderTestCase(TestCase): [dtx, dtx], False, WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=0), - EventDataSetLoaderMultipleExpectedCols + EventDataSetLoaderMultipleExpectedColsNoColumnLoaders ] ] ) diff --git a/zipline/pipeline/loaders/buyback_auth.py b/zipline/pipeline/loaders/buyback_auth.py index b2123545..6d584809 100644 --- a/zipline/pipeline/loaders/buyback_auth.py +++ b/zipline/pipeline/loaders/buyback_auth.py @@ -27,6 +27,8 @@ class CashBuybackAuthorizationsLoader(EventsLoader): expected_cols = frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, CASH_FIELD_NAME]) + event_date_col = BUYBACK_ANNOUNCEMENT_FIELD_NAME + def __init__(self, all_dates, events_by_sid, @@ -43,7 +45,6 @@ class CashBuybackAuthorizationsLoader(EventsLoader): def cash_amount_loader(self): return self._previous_event_value_loader( self.dataset.cash_amount, - BUYBACK_ANNOUNCEMENT_FIELD_NAME, CASH_FIELD_NAME ) @@ -51,7 +52,6 @@ class CashBuybackAuthorizationsLoader(EventsLoader): def announcement_date_loader(self): return self._previous_event_date_loader( self.dataset.announcement_date, - BUYBACK_ANNOUNCEMENT_FIELD_NAME, ) @@ -70,6 +70,8 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): expected_cols = frozenset([BUYBACK_ANNOUNCEMENT_FIELD_NAME, SHARE_COUNT_FIELD_NAME]) + event_date_col = BUYBACK_ANNOUNCEMENT_FIELD_NAME + def __init__(self, all_dates, events_by_sid, @@ -86,7 +88,6 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): def share_count_loader(self): return self._previous_event_value_loader( self.dataset.share_count, - BUYBACK_ANNOUNCEMENT_FIELD_NAME, SHARE_COUNT_FIELD_NAME ) @@ -94,5 +95,4 @@ class ShareBuybackAuthorizationsLoader(EventsLoader): def announcement_date_loader(self): return self._previous_event_date_loader( self.dataset.announcement_date, - BUYBACK_ANNOUNCEMENT_FIELD_NAME, ) diff --git a/zipline/pipeline/loaders/consensus_estimates.py b/zipline/pipeline/loaders/consensus_estimates.py index 666931a1..f22378e0 100644 --- a/zipline/pipeline/loaders/consensus_estimates.py +++ b/zipline/pipeline/loaders/consensus_estimates.py @@ -29,6 +29,8 @@ class ConsensusEstimatesLoader(EventsLoader): LOW_FIELD_NAME, ACTUAL_VALUE_FIELD_NAME]) + event_date_col = RELEASE_DATE_FIELD_NAME + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, dataset=ConsensusEstimates): @@ -40,21 +42,18 @@ class ConsensusEstimatesLoader(EventsLoader): def next_release_date_loader(self): return self._next_event_date_loader( self.dataset.next_release_date, - RELEASE_DATE_FIELD_NAME ) @lazyval def previous_release_date_loader(self): return self._previous_event_date_loader( self.dataset.previous_release_date, - RELEASE_DATE_FIELD_NAME, ) @lazyval def next_standard_deviation_loader(self): return self._next_event_value_loader( self.dataset.next_standard_deviation, - RELEASE_DATE_FIELD_NAME, STANDARD_DEVIATION_FIELD_NAME, ) @@ -62,7 +61,6 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_standard_deviation_loader(self): return self._previous_event_value_loader( self.dataset.previous_standard_deviation, - RELEASE_DATE_FIELD_NAME, STANDARD_DEVIATION_FIELD_NAME, ) @@ -70,7 +68,6 @@ class ConsensusEstimatesLoader(EventsLoader): def next_count_loader(self): return self._next_event_value_loader( self.dataset.next_count, - RELEASE_DATE_FIELD_NAME, COUNT_FIELD_NAME, ) @@ -78,7 +75,6 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_count_loader(self): return self._previous_event_value_loader( self.dataset.previous_count, - RELEASE_DATE_FIELD_NAME, COUNT_FIELD_NAME, ) @@ -86,7 +82,6 @@ class ConsensusEstimatesLoader(EventsLoader): def next_fiscal_quarter_loader(self): return self._next_event_value_loader( self.dataset.next_fiscal_quarter, - RELEASE_DATE_FIELD_NAME, FISCAL_QUARTER_FIELD_NAME, ) @@ -94,7 +89,6 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_fiscal_quarter_loader(self): return self._previous_event_value_loader( self.dataset.previous_fiscal_quarter, - RELEASE_DATE_FIELD_NAME, FISCAL_QUARTER_FIELD_NAME, ) @@ -102,7 +96,6 @@ class ConsensusEstimatesLoader(EventsLoader): def next_high_loader(self): return self._next_event_value_loader( self.dataset.next_high, - RELEASE_DATE_FIELD_NAME, HIGH_FIELD_NAME, ) @@ -110,7 +103,6 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_high_loader(self): return self._previous_event_value_loader( self.dataset.previous_high, - RELEASE_DATE_FIELD_NAME, HIGH_FIELD_NAME, ) @@ -118,7 +110,6 @@ class ConsensusEstimatesLoader(EventsLoader): def next_mean_loader(self): return self._next_event_value_loader( self.dataset.next_mean, - RELEASE_DATE_FIELD_NAME, MEAN_FIELD_NAME, ) @@ -126,7 +117,6 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_mean_loader(self): return self._previous_event_value_loader( self.dataset.previous_mean, - RELEASE_DATE_FIELD_NAME, MEAN_FIELD_NAME, ) @@ -134,7 +124,6 @@ class ConsensusEstimatesLoader(EventsLoader): def next_fiscal_year_loader(self): return self._next_event_value_loader( self.dataset.next_fiscal_year, - RELEASE_DATE_FIELD_NAME, FISCAL_YEAR_FIELD_NAME, ) @@ -142,7 +131,6 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_fiscal_year_loader(self): return self._previous_event_value_loader( self.dataset.previous_fiscal_year, - RELEASE_DATE_FIELD_NAME, FISCAL_YEAR_FIELD_NAME, ) @@ -150,7 +138,6 @@ class ConsensusEstimatesLoader(EventsLoader): def next_low_loader(self): return self._next_event_value_loader( self.dataset.next_low, - RELEASE_DATE_FIELD_NAME, LOW_FIELD_NAME, ) @@ -158,7 +145,6 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_low_loader(self): return self._previous_event_value_loader( self.dataset.previous_low, - RELEASE_DATE_FIELD_NAME, LOW_FIELD_NAME, ) diff --git a/zipline/pipeline/loaders/dividends.py b/zipline/pipeline/loaders/dividends.py index cf6f2dad..339435c1 100644 --- a/zipline/pipeline/loaders/dividends.py +++ b/zipline/pipeline/loaders/dividends.py @@ -17,6 +17,8 @@ class DividendsByAnnouncementDateLoader(EventsLoader): expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME, CASH_AMOUNT_FIELD_NAME]) + event_date_col = ANNOUNCEMENT_FIELD_NAME + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, dataset=DividendsByAnnouncementDate): @@ -28,14 +30,12 @@ class DividendsByAnnouncementDateLoader(EventsLoader): def previous_announcement_date_loader(self): return self._previous_event_date_loader( self.dataset.previous_announcement_date, - ANNOUNCEMENT_FIELD_NAME ) @lazyval def previous_amount_loader(self): return self._previous_event_value_loader( self.dataset.previous_amount, - ANNOUNCEMENT_FIELD_NAME, CASH_AMOUNT_FIELD_NAME ) @@ -44,6 +44,8 @@ class DividendsByPayDateLoader(EventsLoader): expected_cols = frozenset([PAY_DATE_FIELD_NAME, CASH_AMOUNT_FIELD_NAME]) + event_date_col = PAY_DATE_FIELD_NAME + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, dataset=DividendsByPayDate): @@ -53,27 +55,23 @@ class DividendsByPayDateLoader(EventsLoader): @lazyval def next_date_loader(self): - return self._next_event_date_loader(self.dataset.next_date, - PAY_DATE_FIELD_NAME) + return self._next_event_date_loader(self.dataset.next_date) @lazyval def previous_date_loader(self): return self._previous_event_date_loader( self.dataset.previous_date, - PAY_DATE_FIELD_NAME ) @lazyval def next_amount_loader(self): return self._next_event_value_loader(self.dataset.next_amount, - PAY_DATE_FIELD_NAME, CASH_AMOUNT_FIELD_NAME) @lazyval def previous_amount_loader(self): return self._previous_event_value_loader( self.dataset.previous_amount, - PAY_DATE_FIELD_NAME, CASH_AMOUNT_FIELD_NAME ) @@ -82,6 +80,8 @@ class DividendsByExDateLoader(EventsLoader): expected_cols = frozenset([EX_DATE_FIELD_NAME, CASH_AMOUNT_FIELD_NAME]) + event_date_col = EX_DATE_FIELD_NAME + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, dataset=DividendsByExDate): @@ -91,26 +91,22 @@ class DividendsByExDateLoader(EventsLoader): @lazyval def next_date_loader(self): - return self._next_event_date_loader(self.dataset.next_date, - EX_DATE_FIELD_NAME) + return self._next_event_date_loader(self.dataset.next_date) @lazyval def previous_date_loader(self): return self._previous_event_date_loader( self.dataset.previous_date, - EX_DATE_FIELD_NAME ) @lazyval def next_amount_loader(self): return self._next_event_value_loader(self.dataset.next_amount, - EX_DATE_FIELD_NAME, CASH_AMOUNT_FIELD_NAME) @lazyval def previous_amount_loader(self): return self._previous_event_value_loader( self.dataset.previous_amount, - EX_DATE_FIELD_NAME, CASH_AMOUNT_FIELD_NAME ) diff --git a/zipline/pipeline/loaders/earnings.py b/zipline/pipeline/loaders/earnings.py index 0a43e652..d4ed9bb6 100644 --- a/zipline/pipeline/loaders/earnings.py +++ b/zipline/pipeline/loaders/earnings.py @@ -12,6 +12,8 @@ class EarningsCalendarLoader(EventsLoader): expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME]) + event_date_col = ANNOUNCEMENT_FIELD_NAME + def __init__(self, all_dates, events_by_sid, infer_timestamps=False, dataset=EarningsCalendar): @@ -21,12 +23,10 @@ class EarningsCalendarLoader(EventsLoader): @lazyval def next_announcement_loader(self): - return self._next_event_date_loader(self.dataset.next_announcement, - ANNOUNCEMENT_FIELD_NAME) + return self._next_event_date_loader(self.dataset.next_announcement) @lazyval def previous_announcement_loader(self): return self._previous_event_date_loader( self.dataset.previous_announcement, - ANNOUNCEMENT_FIELD_NAME ) diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 074ee680..19f68e7a 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -77,6 +77,10 @@ class EventsLoader(PipelineLoader): def expected_cols(self): raise NotImplemented('expected_cols') + @abc.abstractproperty + def event_date_col(self): + raise NotImplemented('event_date_col') + def __init__(self, all_dates, events_by_sid, @@ -148,7 +152,7 @@ class EventsLoader(PipelineLoader): raise ValueError( WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=k) ) - self.events_by_sid = {sid: df.dropna(subset=self.expected_cols) for + self.events_by_sid = {sid: df.dropna(subset=[self.event_date_col]) for sid, df in self.events_by_sid.iteritems()} self.dataset = dataset @@ -165,7 +169,7 @@ class EventsLoader(PipelineLoader): for column in columns ) - def _next_event_date_loader(self, next_date_field, event_date_field_name): + def _next_event_date_loader(self, next_date_field): return DataFrameLoader( next_date_field, next_event_frame( @@ -173,15 +177,14 @@ class EventsLoader(PipelineLoader): self.all_dates, next_date_field.missing_value, next_date_field.dtype, - event_date_field_name, - event_date_field_name + self.event_date_col, + self.event_date_col ), adjustments=None, ) def _next_event_value_loader(self, next_value_field, - event_date_field_name, value_field_name): return DataFrameLoader( next_value_field, @@ -190,15 +193,14 @@ class EventsLoader(PipelineLoader): self.all_dates, next_value_field.missing_value, next_value_field.dtype, - event_date_field_name, + self.event_date_col, value_field_name ), adjustments=None, ) def _previous_event_date_loader(self, - prev_date_field, - event_date_field_name): + prev_date_field): return DataFrameLoader( prev_date_field, previous_event_frame( @@ -206,15 +208,14 @@ class EventsLoader(PipelineLoader): self.all_dates, NaTD, 'datetime64[ns]', - event_date_field_name, - event_date_field_name + self.event_date_col, + self.event_date_col ), adjustments=None, ) def _previous_event_value_loader(self, previous_value_field, - event_date_field_name, value_field_name): return DataFrameLoader( previous_value_field, @@ -223,7 +224,7 @@ class EventsLoader(PipelineLoader): self.all_dates, previous_value_field.missing_value, previous_value_field.dtype, - event_date_field_name, + self.event_date_col, value_field_name ), adjustments=None, diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 3b79253f..80642a25 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -280,25 +280,18 @@ def zip_with_floats(dates, flts): return pd.Series(flts, index=dates, dtype='float') -def num_days_in_range(dates, start, end): - """ - Return the number of days in `dates` between start and end, inclusive. - """ - start_idx, stop_idx = dates.slice_locs(start, end) - return stop_idx - start_idx - - def zip_with_dates(index_dates, dts): return pd.Series(pd.to_datetime(dts), index=index_dates) def get_values_for_date_ranges(zip_date_index_with_vals, vals_for_date_intervals, - date_intervals, + starts, + ends, date_index): """ - Returns a Series of values indexed by date based on values for the given - date intervals. + Returns a Series of values indexed by date based on the intervals defined + by the start and end dates. Parameters ---------- @@ -307,10 +300,10 @@ def get_values_for_date_ranges(zip_date_index_with_vals, returns a pd.Series with the values indexed by the dates. vals_for_date_intervals : list A list of values for each date interval in `date_intervals`. - date_intervals : list - A list of pairs of dates, where each pair represents a date interval - that corresponds to the value at the same index in - `vals_for_date_intervals`. + starts : DatetimeIndex + A DatetimeIndex of start dates. + ends : list + A DatetimeIndex of end dates. date_index : DatetimeIndex The DatetimeIndex containing all dates for which values were requested. @@ -321,10 +314,22 @@ def get_values_for_date_ranges(zip_date_index_with_vals, to dates based on the given date intervals. """ # Fill in given values for given date ranges. + end_indexes = date_index.values.searchsorted(ends) + start_indexes = date_index.values.searchsorted(starts) + num_days = (end_indexes - start_indexes) + 1 + + # In case any of the end dates falls on days missing from the date_index, + # searchsorted will have placed their index within `date_index` to the + # index of the next start date, so we will have added 1 extra day for + # each of these. Subtract those extra days, but ignore any cases where the + # start and end dates are equal. Note: if any of the start dates is + # missing, it won't affect calculations because searchsorted will advance + # the index to the next date within the same range. + num_days[np.where(~np.in1d(ends, date_index) & (num_days != 0))] -= 1 return zip_date_index_with_vals( date_index, - np.repeat(vals_for_date_intervals, - [num_days_in_range(date_index, *date_interval) - for date_interval in - date_intervals]), + np.repeat( + vals_for_date_intervals, + num_days, + ) ) diff --git a/zipline/testing/fixtures.py b/zipline/testing/fixtures.py index 35eaa406..3ae3deb8 100644 --- a/zipline/testing/fixtures.py +++ b/zipline/testing/fixtures.py @@ -912,7 +912,8 @@ class WithPipelineEventDataLoader(with_metaclass( frame = pd.DataFrame({sid: get_values_for_date_ranges( zip_date_index_with_vals, vals[sid], - date_intervals[sid], + pd.DatetimeIndex(zip(*date_intervals[sid])[0]), + pd.DatetimeIndex(zip(*date_intervals[sid])[1]), dates ) for sid in self.get_sids()[:-1]}) frame[self.get_sids()[-1]] = zip_date_index_with_vals( @@ -982,7 +983,7 @@ class WithPipelineEventDataLoader(with_metaclass( for sid in self.get_sids(): for col_name in cols.keys(): - assert_series_equal(result[col_name].xs(sid, level=1), + assert_series_equal(result[col_name].unstack(1)[sid], cols[col_name][sid], check_names=False) @@ -1109,24 +1110,24 @@ class WithNextAndPreviousEventDataLoader(WithPipelineEventDataLoader): ] next_date_intervals = [ - [[None, '2014-01-04'], + [['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-15'], ['2014-01-16', '2014-01-20'], - ['2014-01-21', None]], - [[None, '2014-01-04'], + ['2014-01-21', '2014-01-31']], + [['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-09'], ['2014-01-10', '2014-01-15'], ['2014-01-16', '2014-01-20'], - ['2014-01-21', None]], - [[None, '2014-01-04'], + ['2014-01-21', '2014-01-31']], + [['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-10'], ['2014-01-11', '2014-01-14'], ['2014-01-15', '2014-01-20'], - ['2014-01-21', None]], - [[None, '2014-01-04'], + ['2014-01-21', '2014-01-31']], + [['2014-01-01', '2014-01-04'], ['2014-01-05', '2014-01-10'], ['2014-01-11', '2014-01-15'], - ['2014-01-16', None]] + ['2014-01-16', '2014-01-31']] ] next_dates = [ @@ -1138,18 +1139,18 @@ class WithNextAndPreviousEventDataLoader(WithPipelineEventDataLoader): ] prev_date_intervals = [ - [[None, '2014-01-14'], + [['2014-01-01', '2014-01-14'], ['2014-01-15', '2014-01-19'], - ['2014-01-20', None]], - [[None, '2014-01-14'], + ['2014-01-20', '2014-01-31']], + [['2014-01-01', '2014-01-14'], ['2014-01-15', '2014-01-19'], - ['2014-01-20', None]], - [[None, '2014-01-09'], + ['2014-01-20', '2014-01-31']], + [['2014-01-01', '2014-01-09'], ['2014-01-10', '2014-01-19'], - ['2014-01-20', None]], - [[None, '2014-01-09'], + ['2014-01-20', '2014-01-31']], + [['2014-01-01', '2014-01-09'], ['2014-01-10', '2014-01-14'], - ['2014-01-15', None]] + ['2014-01-15', '2014-01-31']] ] prev_dates = [ From 8585fd5b59f294f8ef2716f86f3efd7ba2426a2b Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Thu, 21 Apr 2016 15:01:03 -0400 Subject: [PATCH 4/6] MAINT: move filtering for nulls in date column to blaze loader --- zipline/pipeline/loaders/blaze/events.py | 4 +++- zipline/pipeline/loaders/events.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 6fae418f..e951cac0 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -78,7 +78,9 @@ class BlazeEventsLoader(PipelineLoader): expected_fields = self._expected_fields self._expr = bind_expression_to_resources( - expr[list(expected_fields)], + expr[list(expected_fields)][expr[ + self.concrete_loader.event_date_col + ].notnull()], resources, ) self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {} diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 19f68e7a..42afac97 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -152,8 +152,7 @@ class EventsLoader(PipelineLoader): raise ValueError( WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=k) ) - self.events_by_sid = {sid: df.dropna(subset=[self.event_date_col]) for - sid, df in self.events_by_sid.iteritems()} + self.dataset = dataset def get_loader(self, column): From 390295481cfbcb1fee346c69cb266ebbd8b03b3a Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Thu, 21 Apr 2016 15:01:27 -0400 Subject: [PATCH 5/6] TST: add test for blaze loader with null data in date col MAINT: fix blaze query --- tests/pipeline/test_events.py | 94 +++++++++++------------- zipline/pipeline/loaders/blaze/events.py | 3 +- 2 files changed, 44 insertions(+), 53 deletions(-) diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 50ba8a2f..0a9c8b1d 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -6,8 +6,9 @@ from unittest import TestCase import blaze as bz from nose_parameterized import parameterized +from numpy.testing import assert_array_equal import pandas as pd -from pandas.util.testing import assert_series_equal, assert_frame_equal +from pandas.util.testing import assert_series_equal from zipline.pipeline.common import ( ANNOUNCEMENT_FIELD_NAME, @@ -26,7 +27,7 @@ from zipline.pipeline.loaders.events import ( WRONG_SINGLE_COL_DATA_FORMAT_ERROR ) from zipline.utils.memoize import lazyval -from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype +from zipline.utils.numpy_utils import datetime64ns_dtype OTHER_FIELD = "other_field" @@ -38,11 +39,6 @@ class EventDataSet(DataSet): previous_announcement = Column(datetime64ns_dtype) -class OtherFieldEventDataSet(DataSet): - previous_announcement = Column(datetime64ns_dtype) - previous_other_field = Column(float64_dtype) - - class EventDataSetLoader(EventsLoader): expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME]) @@ -67,29 +63,6 @@ class EventDataSetLoader(EventsLoader): ) -class EventDataSetLoaderMultipleExpectedCols(EventDataSetLoader): - expected_cols = frozenset([ANNOUNCEMENT_FIELD_NAME, OTHER_FIELD]) - event_date_col = ANNOUNCEMENT_FIELD_NAME - - def __init__(self, - all_dates, - events_by_sid, - infer_timestamps=False, - dataset=OtherFieldEventDataSet): - super(EventDataSetLoader, self).__init__( - all_dates, - events_by_sid, - infer_timestamps=infer_timestamps, - dataset=dataset, - ) - - @lazyval - def previous_other_field_loader(self): - return self._previous_event_date_loader( - self.dataset.previous_announcement, - ) - - # Test case just for catching an error when multiple columns are in the wrong # data format, so no loader defined. class EventDataSetLoaderMultipleExpectedColsNoColumnLoaders(EventsLoader): @@ -144,28 +117,6 @@ class EventLoaderTestCase(TestCase): EventDataSetLoader ) - def test_null_in_event_date_col(self): - # Tests that getting a null date in the event date column filters the - # entire row from the data. - dates_with_null = pd.Series(dtx) - dates_with_null[2] = pd.NaT - other_col_data = pd.Series(range(0, len(dtx))) - events_by_sid = {0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: - dates_with_null, - OTHER_FIELD: other_col_data, - TS_FIELD_NAME: dtx})} - loader = EventDataSetLoaderMultipleExpectedCols( - dtx, - events_by_sid, - ) - - expected = events_by_sid[0].drop(2, axis=0).set_index(TS_FIELD_NAME) - # Check that index by first given date has been added - assert_frame_equal( - loader.events_by_sid[0], - expected, - ) - @parameterized.expand([ # DataFrame without timestamp column and infer_timestamps = True [pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: dtx}), True], @@ -293,3 +244,42 @@ class BlazeEventLoaderTestCase(TestCase): SID_FIELD_NAME: 0}) ) ) + + +class BlazeEventDataSetLoader(BlazeEventsLoader): + concrete_loader = EventDataSetLoader + _expected_fields = frozenset({ANNOUNCEMENT_FIELD_NAME, + TS_FIELD_NAME, + SID_FIELD_NAME}) + + def __init__(self, + expr, + dataset=EventDataSet, + **kwargs): + super( + BlazeEventDataSetLoader, self + ).__init__(expr, + dataset=dataset, + **kwargs) + + +class BlazeEventLoaderNullInDateColumnTestCase(TestCase): + def test_null_in_event_date_col(self): + # Tests that if there is a null date in the event date column, it is + # filtered out and does not break on loading the adjusted array. + dates_with_null = pd.Series(dtx) + dates_with_null[2] = pd.NaT + events_by_sid = pd.DataFrame({SID_FIELD_NAME: 0, + ANNOUNCEMENT_FIELD_NAME: dates_with_null, + TS_FIELD_NAME: dtx}) + loader = BlazeEventDataSetLoader( + bz.data(events_by_sid), + ) + + result = loader.load_adjusted_array({ + EventDataSet.previous_announcement + }, dtx, [0], [True])[EventDataSet.previous_announcement].data[:, 0] + + expected = dates_with_null.copy(True) + expected[2] = dtx[1] + assert_array_equal(result, expected) diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index e951cac0..34d7f37d 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -77,8 +77,9 @@ class BlazeEventsLoader(PipelineLoader): ) expected_fields = self._expected_fields + expr = expr[list(expected_fields)] self._expr = bind_expression_to_resources( - expr[list(expected_fields)][expr[ + expr[expr[ self.concrete_loader.event_date_col ].notnull()], resources, From 0191d9d90354d8a83366a2374f7615299c4d1025 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Fri, 22 Apr 2016 11:54:55 -0400 Subject: [PATCH 6/6] MAINT: move filtering for null date rows back to dataframe TST: test both next and prev event frame loading and use EventsLoader. BUG: remove extra arg MAINT: call list on zip for compatibility with python 3 --- tests/pipeline/test_events.py | 58 ++++++++++++------- zipline/pipeline/loaders/blaze/events.py | 5 +- .../pipeline/loaders/consensus_estimates.py | 1 - zipline/pipeline/loaders/events.py | 3 +- zipline/testing/fixtures.py | 4 +- 5 files changed, 41 insertions(+), 30 deletions(-) diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py index 0a9c8b1d..9663b94e 100644 --- a/tests/pipeline/test_events.py +++ b/tests/pipeline/test_events.py @@ -6,6 +6,7 @@ from unittest import TestCase import blaze as bz from nose_parameterized import parameterized +import numpy as np from numpy.testing import assert_array_equal import pandas as pd from pandas.util.testing import assert_series_equal @@ -37,6 +38,7 @@ ABSTRACT_EXPECTED_COLS_ERROR = 'abstract methods event_date_col, expected_cols' class EventDataSet(DataSet): previous_announcement = Column(datetime64ns_dtype) + next_announcement = Column(datetime64ns_dtype) class EventDataSetLoader(EventsLoader): @@ -62,6 +64,12 @@ class EventDataSetLoader(EventsLoader): self.dataset.previous_announcement, ) + @lazyval + def next_announcement_loader(self): + return self._next_event_date_loader( + self.dataset.next_announcement, + ) + # Test case just for catching an error when multiple columns are in the wrong # data format, so no loader defined. @@ -90,6 +98,34 @@ dtx = pd.date_range('2014-01-01', '2014-01-10') class EventLoaderTestCase(TestCase): + def test_null_in_event_date_col(self): + # Tests that if there is a null date in the event date column, it is + # filtered out and does not break on loading the adjusted array. + dates_with_null = pd.Series(dtx) + dates_with_null[2] = pd.NaT + events_by_sid = {0: pd.DataFrame({ANNOUNCEMENT_FIELD_NAME: + dates_with_null, + TS_FIELD_NAME: dtx})} + loader = EventDataSetLoader( + dtx, + events_by_sid, + ) + + prev_result = loader.load_adjusted_array({ + EventDataSet.previous_announcement + }, dtx, [0], [True])[EventDataSet.previous_announcement].data[:, 0] + + next_result = loader.load_adjusted_array({ + EventDataSet.next_announcement + }, dtx, [0], [True])[EventDataSet.next_announcement].data[:, 0] + + expected_prev = dates_with_null[:] + expected_prev[2] = dtx[1] + assert_array_equal(prev_result, expected_prev) + expected_next = dates_with_null[:] + expected_next[2] = np.datetime64('NaT') + assert_array_equal(next_result, expected_next) + def assert_loader_error(self, events_by_sid, error, msg, infer_timestamps, loader): with self.assertRaisesRegexp(error, re.escape(msg)): @@ -261,25 +297,3 @@ class BlazeEventDataSetLoader(BlazeEventsLoader): ).__init__(expr, dataset=dataset, **kwargs) - - -class BlazeEventLoaderNullInDateColumnTestCase(TestCase): - def test_null_in_event_date_col(self): - # Tests that if there is a null date in the event date column, it is - # filtered out and does not break on loading the adjusted array. - dates_with_null = pd.Series(dtx) - dates_with_null[2] = pd.NaT - events_by_sid = pd.DataFrame({SID_FIELD_NAME: 0, - ANNOUNCEMENT_FIELD_NAME: dates_with_null, - TS_FIELD_NAME: dtx}) - loader = BlazeEventDataSetLoader( - bz.data(events_by_sid), - ) - - result = loader.load_adjusted_array({ - EventDataSet.previous_announcement - }, dtx, [0], [True])[EventDataSet.previous_announcement].data[:, 0] - - expected = dates_with_null.copy(True) - expected[2] = dtx[1] - assert_array_equal(result, expected) diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 34d7f37d..6fae418f 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -77,11 +77,8 @@ class BlazeEventsLoader(PipelineLoader): ) expected_fields = self._expected_fields - expr = expr[list(expected_fields)] self._expr = bind_expression_to_resources( - expr[expr[ - self.concrete_loader.event_date_col - ].notnull()], + expr[list(expected_fields)], resources, ) self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {} diff --git a/zipline/pipeline/loaders/consensus_estimates.py b/zipline/pipeline/loaders/consensus_estimates.py index f22378e0..9d8f20f8 100644 --- a/zipline/pipeline/loaders/consensus_estimates.py +++ b/zipline/pipeline/loaders/consensus_estimates.py @@ -152,6 +152,5 @@ class ConsensusEstimatesLoader(EventsLoader): def previous_actual_value_loader(self): return self._previous_event_value_loader( self.dataset.previous_actual_value, - RELEASE_DATE_FIELD_NAME, ACTUAL_VALUE_FIELD_NAME, ) diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 42afac97..be1a8ba3 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -152,7 +152,8 @@ class EventsLoader(PipelineLoader): raise ValueError( WRONG_MANY_COL_DATA_FORMAT_ERROR.format(sid=k) ) - + self.events_by_sid = {sid: df.dropna(subset=[self.event_date_col]) for + sid, df in self.events_by_sid.items()} self.dataset = dataset def get_loader(self, column): diff --git a/zipline/testing/fixtures.py b/zipline/testing/fixtures.py index 3ae3deb8..d1547489 100644 --- a/zipline/testing/fixtures.py +++ b/zipline/testing/fixtures.py @@ -912,8 +912,8 @@ class WithPipelineEventDataLoader(with_metaclass( frame = pd.DataFrame({sid: get_values_for_date_ranges( zip_date_index_with_vals, vals[sid], - pd.DatetimeIndex(zip(*date_intervals[sid])[0]), - pd.DatetimeIndex(zip(*date_intervals[sid])[1]), + pd.DatetimeIndex(list(zip(*date_intervals[sid]))[0]), + pd.DatetimeIndex(list(zip(*date_intervals[sid]))[1]), dates ) for sid in self.get_sids()[:-1]}) frame[self.get_sids()[-1]] = zip_date_index_with_vals(