Merge pull request #1288 from quantopian/pandas-tz-normalize-bug-workaround

Pandas tz normalize bug workaround
2026-07-02 19:03:42 +08:00 · 2016-06-23 14:20:35 -04:00
parent abd10d0ca2 07b84f87fb
commit c26e12bad8
2 changed files with 80 additions and 5 deletions
@@ -1,9 +1,12 @@
 """
 Tests for setting up an EventsLoader and a BlazeEventsLoader.
 """
+from datetime import time
+import itertools
 from itertools import product

 import blaze as bz
+from nose_parameterized import parameterized
 import numpy as np
 import pandas as pd

@@ -17,10 +20,11 @@ from zipline.pipeline.data import DataSet, Column
 from zipline.pipeline.loaders.events import EventsLoader
 from zipline.pipeline.loaders.blaze.events import BlazeEventsLoader
 from zipline.pipeline.loaders.utils import (
-    previous_event_indexer,
    next_event_indexer,
+    normalize_timestamp_to_query_time,
+    previous_event_indexer,
 )
-from zipline.testing import ZiplineTestCase
+from zipline.testing import check_arrays, ZiplineTestCase
 from zipline.testing.fixtures import (
    WithAssetFinder,
    WithNYSETradingDays,
@@ -456,3 +460,70 @@ class BlazeEventsLoaderTestCase(EventsLoaderTestCase):
            next_value_columns,
            previous_value_columns,
        )
+
+
+class EventLoaderUtilsTestCase(ZiplineTestCase):
+    # These cases test the following:
+    # 1. Timestamps that mix daylight-saving and standard time normalize
+    #    correctly regardless of the order in which they appear.
+    # 2. Timestamps at the query-time boundary are normalized correctly.
+    boundary_dates = [pd.Timestamp('2013-01-04 8:44:59'),
+                      pd.Timestamp('2013-01-04 8:45:00'),
+                      pd.Timestamp('2013-01-04 8:46:00')]
+    us_boundary_dates = [date.tz_localize('US/Eastern') for date in
+                         boundary_dates]
+    moscow_boundary_dates = [date.tz_localize('Europe/Moscow') for date in
+                             boundary_dates]
+    mixed_tz_dates = [pd.Timestamp('2013-01-24'),
+                      pd.Timestamp('2013-01-31 20:00:00'),
+                      pd.Timestamp('2013-04-04'),
+                      pd.Timestamp('2013-04-21')]
+    us_dates = pd.to_datetime(us_boundary_dates + mixed_tz_dates,
+                              utc=True).tz_localize(None)
+    moscow_dates = pd.to_datetime(moscow_boundary_dates + mixed_tz_dates,
+                                  utc=True).tz_localize(None)
+
+    combos = list(map(np.array, itertools.permutations(np.arange(len(
+        boundary_dates + mixed_tz_dates)))))
+
+    expected_us = pd.Series(
+        [pd.Timestamp('2013-01-04'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-24'),
+         pd.Timestamp('2013-02-01'),
+         pd.Timestamp('2013-04-04'),
+         pd.Timestamp('2013-04-21')]
+    ).values
+
+    # The Europe/Moscow UTC offset is +4 for these dates.
+    expected_russia = pd.Series(
+        [pd.Timestamp('2013-01-04'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-05'),
+         pd.Timestamp('2013-01-24'),
+         pd.Timestamp('2013-01-31'),
+         pd.Timestamp('2013-04-04'),
+         pd.Timestamp('2013-04-21')]
+    ).values
+
+    # Test with timezones on either side of the prime meridian.
+    @parameterized.expand([(expected_us, 'US/Eastern', us_dates),
+                           (expected_russia, 'Europe/Moscow', moscow_dates)])
+    def test_normalize_to_query_time(self, expected, tz, dates):
+        # Order matters in pandas 0.18.2. Prior to that, using tz_convert on
+        # a DatetimeIndex with DST/EST timestamps mixed resulted in some of
+        # them being an hour off (1 hour past midnight).
+        for scrambler in self.combos:
+            df = pd.DataFrame({"timestamp": dates[scrambler]})
+            result = normalize_timestamp_to_query_time(df,
+                                                       time(8, 45),
+                                                       tz,
+                                                       inplace=False,
+                                                       ts_field='timestamp')
+
+            timestamps = result['timestamp'].values
+            check_arrays(
+                timestamps,
+                expected[scrambler]
+            )
@@ -237,11 +237,15 @@ def normalize_timestamp_to_query_time(df,
        _midnight,
        include_end=False,
    )
-    # for all of the times that are greater than our query time add 1
-    # day and truncate to the date
+    # For all of the times that are greater than our query time add 1
+    # day and truncate to the date.
+    # We normalize twice here because of a bug in pandas 0.16.1 that causes
+    # tz_localize() to shift some timestamps by an hour if they are not grouped
+    # together by DST/EST.
    df.loc[to_roll_forward, ts_field] = (
        dtidx_local_time[to_roll_forward] + datetime.timedelta(days=1)
-    ).normalize().tz_localize(None).tz_localize('utc')  # cast back to utc
+    ).normalize().tz_localize(None).tz_localize('utc').normalize()
+
    df.loc[~to_roll_forward, ts_field] = dtidx[~to_roll_forward].normalize()
    return df