From 012888fcc7f2e5f3f705f75d850800878eee91ab Mon Sep 17 00:00:00 2001 From: Andrew Daniels Date: Wed, 24 Aug 2016 13:14:16 -0400 Subject: [PATCH] MAINT: Modifies minute bars to use a dict of OHLC ratios (#1428) For scaling up pricing data before writing to bcolz, the writer now accepts a dict mapping each sid to the ratio to use. It still accepts a single ratio as default_ohlc_ratio, which is used as a fallback if no mapping exists for a given sid. The default is OHLC_RATIO (1000). This allows better handling of futures pricing data, where the required precision across root symbols is not consistent. --- zipline/data/minute_bars.py | 103 ++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 23 deletions(-) diff --git a/zipline/data/minute_bars.py b/zipline/data/minute_bars.py index de1e67d6..de5de473 100644 --- a/zipline/data/minute_bars.py +++ b/zipline/data/minute_bars.py @@ -24,6 +24,7 @@ from intervaltree import IntervalTree import numpy as np import pandas as pd from six import with_metaclass +from toolz import keymap, valmap from zipline.data._minute_bar_internal import ( minute_value, @@ -208,7 +209,7 @@ class BcolzMinuteBarMetadata(object): minutes_per_day : int The number of minutes per each period. """ - FORMAT_VERSION = 2 + FORMAT_VERSION = 3 METADATA_FILENAME = 'metadata.json' @@ -229,7 +230,7 @@ class BcolzMinuteBarMetadata(object): # if version does not match. version = 0 - ohlc_ratio = raw_data['ohlc_ratio'] + default_ohlc_ratio = raw_data['ohlc_ratio'] if version >= 1: minutes_per_day = raw_data['minutes_per_day'] @@ -254,8 +255,16 @@ class BcolzMinuteBarMetadata(object): raw_data['market_closes'][-1], unit='m', tz='UTC') ) + if version >= 3: + ohlc_ratios_per_sid = raw_data['ohlc_ratios_per_sid'] + if ohlc_ratios_per_sid is not None: + ohlc_ratios_per_sid = keymap(int, ohlc_ratios_per_sid) + else: + ohlc_ratios_per_sid = None + return cls( - ohlc_ratio, + default_ohlc_ratio, + ohlc_ratios_per_sid, calendar, start_session, end_session, @@ -265,7 +274,8 @@ class BcolzMinuteBarMetadata(object): def __init__( self, - ohlc_ratio, + default_ohlc_ratio, + ohlc_ratios_per_sid, calendar, start_session, end_session, @@ -275,7 +285,8 @@ class BcolzMinuteBarMetadata(object): self.calendar = calendar self.start_session = start_session self.end_session = end_session - self.ohlc_ratio = ohlc_ratio + self.default_ohlc_ratio = default_ohlc_ratio + self.ohlc_ratios_per_sid = ohlc_ratios_per_sid self.minutes_per_day = minutes_per_day self.version = version @@ -288,8 +299,14 @@ class BcolzMinuteBarMetadata(object): version : int The value of FORMAT_VERSION of this class. ohlc_ratio : int - The factor by which the pricing data is multiplied so that the - float data can be stored as an integer. + The default ratio by which to multiply the pricing data to + convert the floats from floats to an integer to fit within + the np.uint32. If ohlc_ratios_per_sid is None or does not + contain a mapping for a given sid, this ratio is used. + ohlc_ratios_per_sid : dict + A dict mapping each sid in the output to the factor by + which the pricing data is multiplied so that the float data + can be stored as an integer. minutes_per_day : int The number of minutes per each period. calendar_name : str @@ -326,7 +343,8 @@ class BcolzMinuteBarMetadata(object): metadata = { 'version': self.version, - 'ohlc_ratio': self.ohlc_ratio, + 'ohlc_ratio': self.default_ohlc_ratio, + 'ohlc_ratios_per_sid': self.ohlc_ratios_per_sid, 'minutes_per_day': self.minutes_per_day, 'calendar_name': self.calendar.name, 'start_session': str(self.start_session.date()), @@ -365,12 +383,15 @@ class BcolzMinuteBarWriter(object): The first trading session in the data set. end_session : datetime The last trading session in the data set. - ohlc_ratio : int, optional - The ratio by which to multiply the pricing data to convert the - floats from floats to an integer to fit within the np.uint32. - - The default is 1000 to support pricing data which comes in to the - thousands place. + default_ohlc_ratio : int, optional + The default ratio by which to multiply the pricing data to + convert from floats to integers that fit within np.uint32. If + ohlc_ratios_per_sid is None or does not contain a mapping for a + given sid, this ratio is used. Default is OHLC_RATIO (1000). + ohlc_ratios_per_sid : dict, optional + A dict mapping each sid in the output to the ratio by which to + multiply the pricing data to convert the floats from floats to + an integer to fit within the np.uint32. expectedlen : int, optional The expected length of the dataset, used when creating the initial bcolz ctable. @@ -434,7 +455,8 @@ class BcolzMinuteBarWriter(object): start_session, end_session, minutes_per_day, - ohlc_ratio=OHLC_RATIO, + default_ohlc_ratio=OHLC_RATIO, + ohlc_ratios_per_sid=None, expectedlen=DEFAULT_EXPECTEDLEN): self._rootdir = rootdir @@ -447,13 +469,15 @@ class BcolzMinuteBarWriter(object): self._session_labels = self._schedule.index self._minutes_per_day = minutes_per_day self._expectedlen = expectedlen - self._ohlc_ratio = ohlc_ratio + self._default_ohlc_ratio = default_ohlc_ratio + self._ohlc_ratios_per_sid = ohlc_ratios_per_sid self._minute_index = _calc_minute_index( self._schedule.market_open, self._minutes_per_day) metadata = BcolzMinuteBarMetadata( - self._ohlc_ratio, + self._default_ohlc_ratio, + self._ohlc_ratios_per_sid, self._calendar, self._start_session, self._end_session, @@ -465,6 +489,17 @@ class BcolzMinuteBarWriter(object): def first_trading_day(self): return self._start_session + def ohlc_ratio_for_sid(self, sid): + if self._ohlc_ratios_per_sid is not None: + try: + return self._ohlc_ratios_per_sid[sid] + except KeyError: + pass + + # If no ohlc_ratios_per_sid dict is passed, or if the specified + # sid is not in the dict, fallback to the general ohlc_ratio. + return self._default_ohlc_ratio + def sidpath(self, sid): """ Parameters: @@ -772,7 +807,7 @@ class BcolzMinuteBarWriter(object): dt_ixs = np.searchsorted(all_minutes_in_window.values, dts.astype('datetime64[ns]')) - ohlc_ratio = self._ohlc_ratio + ohlc_ratio = self.ohlc_ratio_for_sid(sid) def convert_col(col): """Adapt float column into a uint32 column. @@ -832,7 +867,13 @@ class BcolzMinuteBarReader(MinuteBarReader): self._market_close_values = self._market_closes.values.\ astype('datetime64[m]').astype(np.int64) - self._ohlc_inverse = 1.0 / metadata.ohlc_ratio + self._default_ohlc_inverse = 1.0 / metadata.default_ohlc_ratio + ohlc_ratios = metadata.ohlc_ratios_per_sid + if ohlc_ratios: + self._ohlc_inverses_per_sid = ( + valmap(lambda x: 1.0 / x, ohlc_ratios)) + else: + self._ohlc_inverses_per_sid = None self._minutes_per_day = metadata.minutes_per_day @@ -856,6 +897,17 @@ class BcolzMinuteBarReader(MinuteBarReader): def first_trading_day(self): return self._start_session + def _ohlc_ratio_inverse_for_sid(self, sid): + if self._ohlc_inverses_per_sid is not None: + try: + return self._ohlc_inverses_per_sid[sid] + except KeyError: + pass + + # If we can not get a sid-specific OHLC inverse for this sid, + # fallback to the default. + return self._default_ohlc_inverse + def _minutes_to_exclude(self): """ Calculate the minutes which should be excluded when a window @@ -999,8 +1051,9 @@ class BcolzMinuteBarReader(MinuteBarReader): return 0 else: return np.nan + if field != 'volume': - value *= self._ohlc_inverse + value *= self._ohlc_ratio_inverse_for_sid(sid) return value def get_last_traded_dt(self, asset, dt): @@ -1111,11 +1164,15 @@ class BcolzMinuteBarReader(MinuteBarReader): excl_slice = np.s_[ excl_start - start_idx:excl_stop - start_idx + 1] values = np.delete(values, excl_slice) + where = values != 0 # first slice down to len(where) because we might not have # written data for all the minutes requested - out[:len(where), i][where] = values[where] - if field != 'volume': - out *= self._ohlc_inverse + if field != 'volume': + out[:len(where), i][where] = ( + values[where] * self._ohlc_ratio_inverse_for_sid(sid)) + else: + out[:len(where), i][where] = values[where] + results.append(out) return results