MAINT: Modifies minute bars to use a dict of OHLC ratios (#1428)

For scaling up pricing data before writing to bcolz, the writer now
accepts a dict mapping each sid to the ratio to use. It still accepts a
single ratio as default_ohlc_ratio, which is used as a fallback if no
mapping exists for a given sid. The default is OHLC_RATIO (1000).

This allows better handling of futures pricing data, where the required
precision across root symbols is not consistent.
This commit is contained in:
Andrew Daniels
2016-08-24 13:14:16 -04:00
committed by GitHub
parent eba78aec1b
commit 012888fcc7
+80 -23
View File
@@ -24,6 +24,7 @@ from intervaltree import IntervalTree
import numpy as np
import pandas as pd
from six import with_metaclass
from toolz import keymap, valmap
from zipline.data._minute_bar_internal import (
minute_value,
@@ -208,7 +209,7 @@ class BcolzMinuteBarMetadata(object):
minutes_per_day : int
The number of minutes per each period.
"""
FORMAT_VERSION = 2
FORMAT_VERSION = 3
METADATA_FILENAME = 'metadata.json'
@@ -229,7 +230,7 @@ class BcolzMinuteBarMetadata(object):
# if version does not match.
version = 0
ohlc_ratio = raw_data['ohlc_ratio']
default_ohlc_ratio = raw_data['ohlc_ratio']
if version >= 1:
minutes_per_day = raw_data['minutes_per_day']
@@ -254,8 +255,16 @@ class BcolzMinuteBarMetadata(object):
raw_data['market_closes'][-1], unit='m', tz='UTC')
)
if version >= 3:
ohlc_ratios_per_sid = raw_data['ohlc_ratios_per_sid']
if ohlc_ratios_per_sid is not None:
ohlc_ratios_per_sid = keymap(int, ohlc_ratios_per_sid)
else:
ohlc_ratios_per_sid = None
return cls(
ohlc_ratio,
default_ohlc_ratio,
ohlc_ratios_per_sid,
calendar,
start_session,
end_session,
@@ -265,7 +274,8 @@ class BcolzMinuteBarMetadata(object):
def __init__(
self,
ohlc_ratio,
default_ohlc_ratio,
ohlc_ratios_per_sid,
calendar,
start_session,
end_session,
@@ -275,7 +285,8 @@ class BcolzMinuteBarMetadata(object):
self.calendar = calendar
self.start_session = start_session
self.end_session = end_session
self.ohlc_ratio = ohlc_ratio
self.default_ohlc_ratio = default_ohlc_ratio
self.ohlc_ratios_per_sid = ohlc_ratios_per_sid
self.minutes_per_day = minutes_per_day
self.version = version
@@ -288,8 +299,14 @@ class BcolzMinuteBarMetadata(object):
version : int
The value of FORMAT_VERSION of this class.
ohlc_ratio : int
The factor by which the pricing data is multiplied so that the
float data can be stored as an integer.
The default ratio by which to multiply the pricing data to
convert the floats from floats to an integer to fit within
the np.uint32. If ohlc_ratios_per_sid is None or does not
contain a mapping for a given sid, this ratio is used.
ohlc_ratios_per_sid : dict
A dict mapping each sid in the output to the factor by
which the pricing data is multiplied so that the float data
can be stored as an integer.
minutes_per_day : int
The number of minutes per each period.
calendar_name : str
@@ -326,7 +343,8 @@ class BcolzMinuteBarMetadata(object):
metadata = {
'version': self.version,
'ohlc_ratio': self.ohlc_ratio,
'ohlc_ratio': self.default_ohlc_ratio,
'ohlc_ratios_per_sid': self.ohlc_ratios_per_sid,
'minutes_per_day': self.minutes_per_day,
'calendar_name': self.calendar.name,
'start_session': str(self.start_session.date()),
@@ -365,12 +383,15 @@ class BcolzMinuteBarWriter(object):
The first trading session in the data set.
end_session : datetime
The last trading session in the data set.
ohlc_ratio : int, optional
The ratio by which to multiply the pricing data to convert the
floats from floats to an integer to fit within the np.uint32.
The default is 1000 to support pricing data which comes in to the
thousands place.
default_ohlc_ratio : int, optional
The default ratio by which to multiply the pricing data to
convert from floats to integers that fit within np.uint32. If
ohlc_ratios_per_sid is None or does not contain a mapping for a
given sid, this ratio is used. Default is OHLC_RATIO (1000).
ohlc_ratios_per_sid : dict, optional
A dict mapping each sid in the output to the ratio by which to
multiply the pricing data to convert the floats from floats to
an integer to fit within the np.uint32.
expectedlen : int, optional
The expected length of the dataset, used when creating the initial
bcolz ctable.
@@ -434,7 +455,8 @@ class BcolzMinuteBarWriter(object):
start_session,
end_session,
minutes_per_day,
ohlc_ratio=OHLC_RATIO,
default_ohlc_ratio=OHLC_RATIO,
ohlc_ratios_per_sid=None,
expectedlen=DEFAULT_EXPECTEDLEN):
self._rootdir = rootdir
@@ -447,13 +469,15 @@ class BcolzMinuteBarWriter(object):
self._session_labels = self._schedule.index
self._minutes_per_day = minutes_per_day
self._expectedlen = expectedlen
self._ohlc_ratio = ohlc_ratio
self._default_ohlc_ratio = default_ohlc_ratio
self._ohlc_ratios_per_sid = ohlc_ratios_per_sid
self._minute_index = _calc_minute_index(
self._schedule.market_open, self._minutes_per_day)
metadata = BcolzMinuteBarMetadata(
self._ohlc_ratio,
self._default_ohlc_ratio,
self._ohlc_ratios_per_sid,
self._calendar,
self._start_session,
self._end_session,
@@ -465,6 +489,17 @@ class BcolzMinuteBarWriter(object):
def first_trading_day(self):
return self._start_session
def ohlc_ratio_for_sid(self, sid):
if self._ohlc_ratios_per_sid is not None:
try:
return self._ohlc_ratios_per_sid[sid]
except KeyError:
pass
# If no ohlc_ratios_per_sid dict is passed, or if the specified
# sid is not in the dict, fallback to the general ohlc_ratio.
return self._default_ohlc_ratio
def sidpath(self, sid):
"""
Parameters:
@@ -772,7 +807,7 @@ class BcolzMinuteBarWriter(object):
dt_ixs = np.searchsorted(all_minutes_in_window.values,
dts.astype('datetime64[ns]'))
ohlc_ratio = self._ohlc_ratio
ohlc_ratio = self.ohlc_ratio_for_sid(sid)
def convert_col(col):
"""Adapt float column into a uint32 column.
@@ -832,7 +867,13 @@ class BcolzMinuteBarReader(MinuteBarReader):
self._market_close_values = self._market_closes.values.\
astype('datetime64[m]').astype(np.int64)
self._ohlc_inverse = 1.0 / metadata.ohlc_ratio
self._default_ohlc_inverse = 1.0 / metadata.default_ohlc_ratio
ohlc_ratios = metadata.ohlc_ratios_per_sid
if ohlc_ratios:
self._ohlc_inverses_per_sid = (
valmap(lambda x: 1.0 / x, ohlc_ratios))
else:
self._ohlc_inverses_per_sid = None
self._minutes_per_day = metadata.minutes_per_day
@@ -856,6 +897,17 @@ class BcolzMinuteBarReader(MinuteBarReader):
def first_trading_day(self):
return self._start_session
def _ohlc_ratio_inverse_for_sid(self, sid):
if self._ohlc_inverses_per_sid is not None:
try:
return self._ohlc_inverses_per_sid[sid]
except KeyError:
pass
# If we can not get a sid-specific OHLC inverse for this sid,
# fallback to the default.
return self._default_ohlc_inverse
def _minutes_to_exclude(self):
"""
Calculate the minutes which should be excluded when a window
@@ -999,8 +1051,9 @@ class BcolzMinuteBarReader(MinuteBarReader):
return 0
else:
return np.nan
if field != 'volume':
value *= self._ohlc_inverse
value *= self._ohlc_ratio_inverse_for_sid(sid)
return value
def get_last_traded_dt(self, asset, dt):
@@ -1111,11 +1164,15 @@ class BcolzMinuteBarReader(MinuteBarReader):
excl_slice = np.s_[
excl_start - start_idx:excl_stop - start_idx + 1]
values = np.delete(values, excl_slice)
where = values != 0
# first slice down to len(where) because we might not have
# written data for all the minutes requested
out[:len(where), i][where] = values[where]
if field != 'volume':
out *= self._ohlc_inverse
if field != 'volume':
out[:len(where), i][where] = (
values[where] * self._ohlc_ratio_inverse_for_sid(sid))
else:
out[:len(where), i][where] = values[where]
results.append(out)
return results