Merge pull request #1288 from quantopian/pandas-tz-normalize-bug-workaround

Pandas tz normalize bug workaround
This commit is contained in:
Maya Tydykov
2016-06-23 14:20:35 -04:00
committed by GitHub
2 changed files with 80 additions and 5 deletions
+73 -2
View File
@@ -1,9 +1,12 @@
"""
Tests for setting up an EventsLoader and a BlazeEventsLoader.
"""
from datetime import time
import itertools
from itertools import product
import blaze as bz
from nose_parameterized import parameterized
import numpy as np
import pandas as pd
@@ -17,10 +20,11 @@ from zipline.pipeline.data import DataSet, Column
from zipline.pipeline.loaders.events import EventsLoader
from zipline.pipeline.loaders.blaze.events import BlazeEventsLoader
from zipline.pipeline.loaders.utils import (
previous_event_indexer,
next_event_indexer,
normalize_timestamp_to_query_time,
previous_event_indexer,
)
from zipline.testing import ZiplineTestCase
from zipline.testing import check_arrays, ZiplineTestCase
from zipline.testing.fixtures import (
WithAssetFinder,
WithNYSETradingDays,
@@ -456,3 +460,70 @@ class BlazeEventsLoaderTestCase(EventsLoaderTestCase):
next_value_columns,
previous_value_columns,
)
class EventLoaderUtilsTestCase(ZiplineTestCase):
    # The fixtures below exercise normalize_timestamp_to_query_time in two
    # ways:
    #   1. Timestamps from the DST and EST parts of the year are fed in
    #      every possible order and must still normalize to the right dates.
    #   2. Timestamps just before, exactly at, and just after the query time
    #      must land on the correct side of the boundary.

    # Naive wall-clock times around the 8:45 query time used by the test.
    boundary_dates = list(map(pd.Timestamp, ['2013-01-04 8:44:59',
                                             '2013-01-04 8:45:00',
                                             '2013-01-04 8:46:00']))
    # The same wall-clock times, localized to each timezone under test.
    us_boundary_dates = [
        stamp.tz_localize('US/Eastern') for stamp in boundary_dates
    ]
    moscow_boundary_dates = [
        stamp.tz_localize('Europe/Moscow') for stamp in boundary_dates
    ]
    # Naive timestamps taken from January and April.
    mixed_tz_dates = list(map(pd.Timestamp, ['2013-01-24',
                                             '2013-01-31 20:00:00',
                                             '2013-04-04',
                                             '2013-04-21']))

    # Inputs handed to the function: everything is converted to UTC and the
    # tz information is then dropped.
    us_dates = pd.to_datetime(
        us_boundary_dates + mixed_tz_dates,
        utc=True,
    ).tz_localize(None)
    moscow_dates = pd.to_datetime(
        moscow_boundary_dates + mixed_tz_dates,
        utc=True,
    ).tz_localize(None)

    # Every possible ordering of the input positions, each as an index array.
    combos = [
        np.array(ordering)
        for ordering in itertools.permutations(
            np.arange(len(boundary_dates) + len(mixed_tz_dates))
        )
    ]

    # Expected normalized dates, listed in the same (unshuffled) order as the
    # corresponding input dates above.
    expected_us = pd.Series(list(map(pd.Timestamp, ['2013-01-04',
                                                    '2013-01-05',
                                                    '2013-01-05',
                                                    '2013-01-24',
                                                    '2013-02-01',
                                                    '2013-04-04',
                                                    '2013-04-21']))).values

    # Russia's TZ offset is +4
    expected_russia = pd.Series(list(map(pd.Timestamp, ['2013-01-04',
                                                        '2013-01-05',
                                                        '2013-01-05',
                                                        '2013-01-24',
                                                        '2013-01-31',
                                                        '2013-04-04',
                                                        '2013-04-21']))).values

    # Test with timezones on either side of the meridian
    @parameterized.expand([
        (expected_us, 'US/Eastern', us_dates),
        (expected_russia, 'Europe/Moscow', moscow_dates),
    ])
    def test_normalize_to_query_time(self, expected, tz, dates):
        # Order matters in pandas 0.18.2. Prior to that, using tz_convert on
        # a DatetimeIndex with DST/EST timestamps mixed resulted in some of
        # them being an hour off (1 hour past midnight). Checking every
        # permutation guards against any order-dependent result.
        for ordering in self.combos:
            shuffled_input = pd.DataFrame({'timestamp': dates[ordering]})
            normalized = normalize_timestamp_to_query_time(
                shuffled_input,
                time(8, 45),
                tz,
                inplace=False,
                ts_field='timestamp',
            )
            # The expected values are shuffled with the same ordering so
            # that they line up row-for-row with the result.
            check_arrays(
                normalized['timestamp'].values,
                expected[ordering],
            )
+7 -3
View File
@@ -237,11 +237,15 @@ def normalize_timestamp_to_query_time(df,
_midnight,
include_end=False,
)
# for all of the times that are greater than our query time add 1
# day and truncate to the date
# For all of the times that are greater than our query time add 1
# day and truncate to the date.
# We normalize twice here because of a bug in pandas 0.16.1 that causes
# tz_localize() to shift some timestamps by an hour if they are not grouped
# together by DST/EST.
df.loc[to_roll_forward, ts_field] = (
dtidx_local_time[to_roll_forward] + datetime.timedelta(days=1)
).normalize().tz_localize(None).tz_localize('utc') # cast back to utc
).normalize().tz_localize(None).tz_localize('utc').normalize()
df.loc[~to_roll_forward, ts_field] = dtidx[~to_roll_forward].normalize()
return df