From 2d36a58add296becea63b27ca7e8455296afb907 Mon Sep 17 00:00:00 2001 From: Joe Jevnik Date: Tue, 24 May 2016 20:21:53 -0400 Subject: [PATCH] ENH: improve performance of time comparisons Adds `mask_between_time` to do more efficient comparisons between pandas.DatetimeIndex and datetime.time objects. This is used in the loader utils to more efficiently normalize datetimes around the query time. --- zipline/pipeline/loaders/utils.py | 11 +++- zipline/utils/pandas_utils.py | 86 ++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 3 deletions(-) diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index ec15680c..1eb5b371 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -6,6 +6,7 @@ from six import iteritems from six.moves import zip from zipline.utils.numpy_utils import categorical_dtype, NaTns +from zipline.utils.pandas_utils import mask_between_time def next_event_frame(events_by_sid, @@ -209,6 +210,9 @@ def normalize_data_query_bounds(lower, upper, time, tz): return lower, upper +_midnight = datetime.time(0, 0) + + def normalize_timestamp_to_query_time(df, time, tz, @@ -246,7 +250,12 @@ def normalize_timestamp_to_query_time(df, dtidx = pd.DatetimeIndex(df.loc[:, ts_field], tz='utc') dtidx_local_time = dtidx.tz_convert(tz) - to_roll_forward = dtidx_local_time.time >= time + to_roll_forward = mask_between_time( + dtidx_local_time, + time, + _midnight, + include_end=False, + ) # for all of the times that are greater than our query time add 1 # day and truncate to the date df.loc[to_roll_forward, ts_field] = ( diff --git a/zipline/utils/pandas_utils.py b/zipline/utils/pandas_utils.py index 2dd282e9..79d45de4 100644 --- a/zipline/utils/pandas_utils.py +++ b/zipline/utils/pandas_utils.py @@ -1,6 +1,8 @@ """ Utilities for working with pandas objects. 
""" +import operator as op + import pandas as pd @@ -15,6 +17,86 @@ def explode(df): try: # pandas 0.16 compat - sort_values = pd.DataFrame.sort_values + _df_sort_values = pd.DataFrame.sort_values + _series_sort_values = pd.Series.sort_values except AttributeError: - sort_values = pd.DataFrame.sort + _df_sort_values = pd.DataFrame.sort + _series_sort_values = pd.Series.sort + + +def sort_values(ob, *args, **kwargs): + if isinstance(ob, pd.DataFrame): + return _df_sort_values(ob, *args, **kwargs) + elif isinstance(ob, pd.Series): + return _series_sort_values(ob, *args, **kwargs) + raise ValueError( + 'sort_values expected a dataframe or series, not %s: %r' % ( + type(ob).__name__, ob, + ), + ) + + +def _time_to_micros(time): + """Convert a time into microseconds since midnight. + + Parameters + ---------- + time : datetime.time + The time to convert. + + Returns + ------- + micros : int + The number of microseconds since midnight. + """ + seconds = time.hour * 60 * 60 + 60 * time.minute + time.second + return 1000000 * seconds + time.microsecond + + +_opmap = { + (True, True): (op.le, op.le), + (True, False): (op.le, op.lt), + (False, True): (op.lt, op.le), + (False, False): (op.lt, op.lt), +} + + +def mask_between_time(dts, start, end, include_start=True, include_end=True): + """Return a mask of all of the datetimes in ``dts`` that are between + ``start`` and ``end``. + + Parameters + ---------- + dts : pd.DatetimeIndex + The index to mask. + start : time + end : time + The start and end times. + include_start : bool, optional + Inclusive on ``start``. + include_end : bool, optional + Inclusive on ``end``. + + Returns + ------- + mask : np.ndarray[bool] + A bool array masking ``dts``. 
+ + See Also + -------- + :meth:`pandas.DatetimeIndex.indexer_between_time` + """ + time_micros = dts._get_time_micros() + start_micros = _time_to_micros(start) + end_micros = _time_to_micros(end) + + lop, rop = _opmap[include_start, include_end] + if start_micros <= end_micros: + join_op = op.and_ + else: + join_op = op.or_ + + return join_op( + lop(start_micros, time_micros), + rop(time_micros, end_micros), + )