diff --git a/tests/pipeline/test_events.py b/tests/pipeline/test_events.py
index 5747f13c..4b19a932 100644
--- a/tests/pipeline/test_events.py
+++ b/tests/pipeline/test_events.py
@@ -1,9 +1,12 @@
 """
 Tests for setting up an EventsLoader and a BlazeEventsLoader.
 """
+from datetime import time
+import itertools
 from itertools import product
 
 import blaze as bz
+from nose_parameterized import parameterized
 import numpy as np
 import pandas as pd
 
@@ -17,10 +20,11 @@ from zipline.pipeline.data import DataSet, Column
 from zipline.pipeline.loaders.events import EventsLoader
 from zipline.pipeline.loaders.blaze.events import BlazeEventsLoader
 from zipline.pipeline.loaders.utils import (
-    previous_event_indexer,
     next_event_indexer,
+    normalize_timestamp_to_query_time,
+    previous_event_indexer,
 )
-from zipline.testing import ZiplineTestCase
+from zipline.testing import check_arrays, ZiplineTestCase
 from zipline.testing.fixtures import (
     WithAssetFinder,
     WithNYSETradingDays,
@@ -456,3 +460,70 @@ class BlazeEventsLoaderTestCase(EventsLoaderTestCase):
             next_value_columns,
             previous_value_columns,
         )
+
+
+class EventLoaderUtilsTestCase(ZiplineTestCase):
+    # Cases for normalize_timestamp_to_query_time with an 8:45 query time:
+    # 1. Every ordering of timestamps that mix DST/EST dates normalizes to
+    #    the expected dates.
+    # 2. Timestamps just before, at, and after the query time roll correctly.
+    boundary_dates = [pd.Timestamp('2013-01-04 8:44:59'),
+                      pd.Timestamp('2013-01-04 8:45:00'),
+                      pd.Timestamp('2013-01-04 8:46:00')]
+    us_boundary_dates = [date.tz_localize('US/Eastern') for date in
+                         boundary_dates]
+    moscow_boundary_dates = [date.tz_localize('Europe/Moscow') for date in
+                             boundary_dates]
+    mixed_tz_dates = [pd.Timestamp('2013-01-24'),
+                      pd.Timestamp('2013-01-31 20:00:00'),
+                      pd.Timestamp('2013-04-04'),
+                      pd.Timestamp('2013-04-21')]
+    us_dates = pd.to_datetime(us_boundary_dates + mixed_tz_dates,
+                              utc=True).tz_localize(None)
+    moscow_dates = pd.to_datetime(moscow_boundary_dates + mixed_tz_dates,
+                                  utc=True).tz_localize(None)
+
+    combos = list(map(np.array, itertools.permutations(np.arange(len(
+        boundary_dates + mixed_tz_dates)))))
+
+    expected_us = pd.Series(
+        [pd.Timestamp('2013-01-04'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-24'),
+         pd.Timestamp('2013-02-01'),
+         pd.Timestamp('2013-04-04'),
+         pd.Timestamp('2013-04-21')]
+    ).values
+
+    # Moscow is UTC+4 here, so 20:00 UTC is local midnight and does not roll.
+    expected_russia = pd.Series(
+        [pd.Timestamp('2013-01-04'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-24'),
+         pd.Timestamp('2013-01-31'),
+         pd.Timestamp('2013-04-04'),
+         pd.Timestamp('2013-04-21')]
+    ).values
+
+    # Test with timezones on either side of the prime meridian
+    @parameterized.expand([(expected_us, 'US/Eastern', us_dates),
+                           (expected_russia, 'Europe/Moscow', moscow_dates)])
+    def test_normalize_to_query_time(self, expected, tz, dates):
+        # Order matters in pandas 0.18.2. Prior to that, using tz_convert on
+        # a DatetimeIndex with DST/EST timestamps mixed resulted in some of
+        # them being an hour off (1 hour past midnight).
+        for scrambler in self.combos:
+            df = pd.DataFrame({"timestamp": dates[scrambler]})
+            result = normalize_timestamp_to_query_time(df,
+                                                       time(8, 45),
+                                                       tz,
+                                                       inplace=False,
+                                                       ts_field='timestamp')
+
+            timestamps = result['timestamp'].values
+            check_arrays(
+                timestamps,
+                expected[scrambler]
+            )
diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py
index 6036d421..77a9f447 100644
--- a/zipline/pipeline/loaders/utils.py
+++ b/zipline/pipeline/loaders/utils.py
@@ -237,11 +237,15 @@ def normalize_timestamp_to_query_time(df,
         _midnight,
         include_end=False,
     )
-    # for all of the times that are greater than our query time add 1
-    # day and truncate to the date
+    # For all of the times that are greater than our query time add 1
+    # day and truncate to the date.
+    # We normalize twice here because of a bug in pandas 0.16.1 that causes
+    # tz_localize() to shift some timestamps by an hour if they are not grouped
+    # together by DST/EST.
     df.loc[to_roll_forward, ts_field] = (
         dtidx_local_time[to_roll_forward] + datetime.timedelta(days=1)
-    ).normalize().tz_localize(None).tz_localize('utc')  # cast back to utc
+    ).normalize().tz_localize(None).tz_localize('utc').normalize()
+
     df.loc[~to_roll_forward, ts_field] = dtidx[~to_roll_forward].normalize()
     return df
 