mirror of
https://github.com/wassname/catalyst.git
synced 2026-07-01 18:55:33 +08:00
ENH: improve performance of time comparisons
Adds `mask_between_time` to do more efficient comparisons between pandas.DatetimeIndex and datetime.time objects. This is used in the loader utils to more efficiently normalize datetimes around the query time.
This commit is contained in:
@@ -6,6 +6,7 @@ from six import iteritems
|
||||
from six.moves import zip
|
||||
|
||||
from zipline.utils.numpy_utils import categorical_dtype, NaTns
|
||||
from zipline.utils.pandas_utils import mask_between_time
|
||||
|
||||
|
||||
def next_event_frame(events_by_sid,
|
||||
@@ -209,6 +210,9 @@ def normalize_data_query_bounds(lower, upper, time, tz):
|
||||
return lower, upper
|
||||
|
||||
|
||||
# Midnight as a naive time of day. Passing it as an exclusive ``end`` bound
# that is earlier than ``start`` makes the time mask wrap to the end of the
# day.
_midnight = datetime.time(hour=0, minute=0)
|
||||
|
||||
|
||||
def normalize_timestamp_to_query_time(df,
|
||||
time,
|
||||
tz,
|
||||
@@ -246,7 +250,12 @@ def normalize_timestamp_to_query_time(df,
|
||||
|
||||
dtidx = pd.DatetimeIndex(df.loc[:, ts_field], tz='utc')
|
||||
dtidx_local_time = dtidx.tz_convert(tz)
|
||||
to_roll_forward = dtidx_local_time.time >= time
|
||||
to_roll_forward = mask_between_time(
|
||||
dtidx_local_time,
|
||||
time,
|
||||
_midnight,
|
||||
include_end=False,
|
||||
)
|
||||
# for all of the times that are greater than our query time add 1
|
||||
# day and truncate to the date
|
||||
df.loc[to_roll_forward, ts_field] = (
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
"""
|
||||
Utilities for working with pandas objects.
|
||||
"""
|
||||
import operator as op
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
@@ -15,6 +17,86 @@ def explode(df):
|
||||
|
||||
# Resolve the sorting methods once at import time. Older pandas releases
# spell the method ``sort``; newer ones spell it ``sort_values``. Only the
# private aliases are bound here: the public ``sort_values`` name is defined
# as a function further down the module, so binding it here as well would be
# a dead assignment.
try:
    # pandas 0.16 compat
    _df_sort_values = pd.DataFrame.sort_values
    _series_sort_values = pd.Series.sort_values
except AttributeError:
    _df_sort_values = pd.DataFrame.sort
    _series_sort_values = pd.Series.sort
|
||||
|
||||
|
||||
def sort_values(ob, *args, **kwargs):
    """Sort a pandas DataFrame or Series by its values.

    Dispatches to the module-level ``_df_sort_values`` or
    ``_series_sort_values`` callable depending on the type of ``ob``.

    Parameters
    ----------
    ob : pd.DataFrame or pd.Series
        The object to sort.
    *args, **kwargs
        Forwarded unchanged to the underlying sort method.

    Returns
    -------
    The result of the underlying sort method.

    Raises
    ------
    ValueError
        If ``ob`` is neither a DataFrame nor a Series.
    """
    # Checked in this order: DataFrame first, then Series.
    dispatch = (
        (pd.DataFrame, _df_sort_values),
        (pd.Series, _series_sort_values),
    )
    for expected_type, sorter in dispatch:
        if isinstance(ob, expected_type):
            return sorter(ob, *args, **kwargs)

    type_name = type(ob).__name__
    raise ValueError(
        'sort_values expected a dataframe or series, not %s: %r' % (
            type_name, ob,
        ),
    )
|
||||
|
||||
|
||||
def _time_to_micros(time):
    """Convert a time into microseconds since midnight.

    Parameters
    ----------
    time : datetime.time
        The time to convert.

    Returns
    -------
    us : int
        The number of microseconds since midnight.
    """
    seconds = time.hour * 60 * 60 + 60 * time.minute + time.second
    # 1,000,000 microseconds per second, plus the sub-second remainder.
    return 1000000 * seconds + time.microsecond
|
||||
|
||||
|
||||
# Maps ``(include_start, include_end)`` to the pair of comparison operators
# applied to the lower and upper bound respectively: an inclusive bound uses
# ``<=`` and an exclusive bound uses ``<``.
_opmap = {
    (include_start, include_end): (
        op.le if include_start else op.lt,
        op.le if include_end else op.lt,
    )
    for include_start in (True, False)
    for include_end in (True, False)
}
|
||||
|
||||
|
||||
def mask_between_time(dts, start, end, include_start=True, include_end=True):
    """Return a mask of all of the datetimes in ``dts`` whose time of day is
    between ``start`` and ``end``.

    Parameters
    ----------
    dts : pd.DatetimeIndex
        The index to mask.
    start : datetime.time
        The start of the time-of-day window.
    end : datetime.time
        The end of the time-of-day window. When ``end`` is earlier than
        ``start`` the window wraps around midnight, i.e. an entry matches if
        it is after ``start`` *or* before ``end``.
    include_start : bool, optional
        Inclusive on ``start``.
    include_end : bool, optional
        Inclusive on ``end``.

    Returns
    -------
    mask : np.ndarray[bool]
        A bool array masking ``dts``.

    See Also
    --------
    :meth:`pandas.DatetimeIndex.indexer_between_time`
    """
    # Compare integer offsets instead of ``datetime.time`` objects.
    # NOTE(review): ``_get_time_micros`` is a private pandas method and is
    # presumed to return microseconds since midnight per entry - confirm
    # against the supported pandas versions.
    time_micros = dts._get_time_micros()
    start_micros = _time_to_micros(start)
    end_micros = _time_to_micros(end)

    lop, rop = _opmap[include_start, include_end]
    if start_micros <= end_micros:
        # Ordinary window: both bounds must hold.
        join_op = op.and_
    else:
        # Window wraps past midnight: either bound holding is enough.
        join_op = op.or_

    return join_op(
        lop(start_micros, time_micros),
        rop(time_micros, end_micros),
    )
|
||||
|
||||
Reference in New Issue
Block a user