ENH: improve performance of time comparisons

Adds `mask_between_time` to do more efficient comparisons between
pandas.DatetimeIndex and datetime.time objects.

This is used in the loader utils to more efficiently normalize datetimes
around the query time.
This commit is contained in:
Joe Jevnik
2016-05-24 20:21:53 -04:00
parent d34b1b9224
commit 2d36a58add
2 changed files with 94 additions and 3 deletions
+10 -1
View File
@@ -6,6 +6,7 @@ from six import iteritems
from six.moves import zip
from zipline.utils.numpy_utils import categorical_dtype, NaTns
from zipline.utils.pandas_utils import mask_between_time
def next_event_frame(events_by_sid,
@@ -209,6 +210,9 @@ def normalize_data_query_bounds(lower, upper, time, tz):
return lower, upper
_midnight = datetime.time(0, 0)
def normalize_timestamp_to_query_time(df,
time,
tz,
@@ -246,7 +250,12 @@ def normalize_timestamp_to_query_time(df,
dtidx = pd.DatetimeIndex(df.loc[:, ts_field], tz='utc')
dtidx_local_time = dtidx.tz_convert(tz)
to_roll_forward = dtidx_local_time.time >= time
to_roll_forward = mask_between_time(
dtidx_local_time,
time,
_midnight,
include_end=False,
)
# for all of the times that are greater than our query time add 1
# day and truncate to the date
df.loc[to_roll_forward, ts_field] = (
+84 -2
View File
@@ -1,6 +1,8 @@
"""
Utilities for working with pandas objects.
"""
import operator as op
import pandas as pd
@@ -15,6 +17,86 @@ def explode(df):
# Resolve the per-type sort methods once at import time. Newer pandas exposes
# ``sort_values``; the fallback covers older releases (the original note
# cites pandas 0.16 compat) where only ``sort`` is available.
# The public ``sort_values`` name is provided by the dispatching function
# defined below, so it is not aliased here.
try:
    _df_sort_values = pd.DataFrame.sort_values
    _series_sort_values = pd.Series.sort_values
except AttributeError:
    _df_sort_values = pd.DataFrame.sort
    _series_sort_values = pd.Series.sort
def sort_values(ob, *args, **kwargs):
    """Sort a DataFrame or Series with the sort method for its type.

    Parameters
    ----------
    ob : pd.DataFrame or pd.Series
        The object to sort.
    *args, **kwargs
        Forwarded unchanged to the underlying pandas sort method.

    Returns
    -------
    sorted : object
        Whatever the underlying pandas sort method returns.

    Raises
    ------
    ValueError
        If ``ob`` is neither a DataFrame nor a Series.
    """
    # Reject unsupported inputs up front so the dispatch below is total.
    if not isinstance(ob, (pd.DataFrame, pd.Series)):
        raise ValueError(
            'sort_values expected a dataframe or series, not %s: %r' % (
                type(ob).__name__, ob,
            ),
        )

    if isinstance(ob, pd.DataFrame):
        sorter = _df_sort_values
    else:
        sorter = _series_sort_values
    return sorter(ob, *args, **kwargs)
def _time_to_micros(time):
    """Convert a time into microseconds since midnight.

    Parameters
    ----------
    time : datetime.time
        The time to convert.

    Returns
    -------
    us : int
        The number of microseconds since midnight.

    Notes
    -----
    Only the ``hour``, ``minute``, ``second`` and ``microsecond`` fields of
    ``time`` are read.
    """
    seconds = time.hour * 60 * 60 + 60 * time.minute + time.second
    # 1,000,000 microseconds per second, plus the sub-second remainder.
    return 1000000 * seconds + time.microsecond
# Maps ``(include_start, include_end)`` to the ``(left, right)`` comparison
# operators used as ``left(start, t)`` and ``right(t, end)``: an inclusive
# bound compares with ``<=`` and an exclusive bound with ``<``.
_opmap = {
    (start_inclusive, end_inclusive): (
        op.le if start_inclusive else op.lt,
        op.le if end_inclusive else op.lt,
    )
    for start_inclusive in (True, False)
    for end_inclusive in (True, False)
}
def mask_between_time(dts, start, end, include_start=True, include_end=True):
    """Return a mask of all of the datetimes in ``dts`` that are between
    ``start`` and ``end``.

    Parameters
    ----------
    dts : pd.DatetimeIndex
        The index to mask.
    start : time
    end : time
        The start and end times.
    include_start : bool, optional
        Inclusive on ``start``.
    include_end : bool, optional
        Inclusive on ``end``.

    Returns
    -------
    mask : np.ndarray[bool]
        A bool array masking ``dts``.

    Notes
    -----
    When ``start`` is later than ``end`` the two bound checks are joined
    with "or" instead of "and", so the selected interval wraps around
    midnight. When ``start`` equals ``end`` and either bound is exclusive,
    the resulting mask is all False.

    See Also
    --------
    :meth:`pandas.DatetimeIndex.indexer_between_time`
    """
    # NOTE(review): relies on a private pandas helper; presumably it yields
    # microseconds since midnight for each element -- confirm against the
    # pandas version in use.
    time_micros = dts._get_time_micros()
    start_micros = _time_to_micros(start)
    end_micros = _time_to_micros(end)

    # Coerce the flags so any truthy/falsy value (e.g. ``None``) selects an
    # operator pair instead of raising ``KeyError`` on the lookup.
    lop, rop = _opmap[bool(include_start), bool(include_end)]

    if start_micros <= end_micros:
        # Ordinary interval: both bounds must hold.
        join_op = op.and_
    else:
        # Interval wraps past midnight: either bound is sufficient.
        join_op = op.or_

    return join_op(
        lop(start_micros, time_micros),
        rop(time_micros, end_micros),
    )