ENH: Pass calendar instance to BcolzMinuteBarWriter (#1406)

* First pass.

* Improvements and fixes

- Update usages of BcolzMinuteBarWriter
- Updates with rebuilt example data
- Expose calendar from BcolzMinuteBarMetadata instead of calendar_name
- Keep market_opens and market_closes in metadata for compatibility

* Store start_session and end_session in minute bcolz metadata

- start_session replaces first_trading_day
- Add end_session to limit to correct days

* For last_available_dt, get last close from calendar to maintain tz

* Bumps version and handles earlier versions on read

* Rebuilt example data on python 3

* Indicate metadata fields that are deprecated
This commit is contained in:
Andrew Daniels
2016-08-18 15:41:26 -04:00
committed by GitHub
parent ab1b485393
commit 37e6a48e99
7 changed files with 146 additions and 117 deletions
+3 -3
View File
@@ -80,10 +80,10 @@ class BcolzMinuteBarTestCase(WithTradingCalendars,
self.dest = self.instance_tmpdir.getpath('minute_bars')
os.makedirs(self.dest)
self.writer = BcolzMinuteBarWriter(
TEST_CALENDAR_START,
self.dest,
self.market_opens,
self.market_closes,
self.trading_calendar,
TEST_CALENDAR_START,
TEST_CALENDAR_STOP,
US_EQUITIES_MINUTES_PER_DAY,
)
self.reader = BcolzMinuteBarReader(self.dest)
Binary file not shown.
+3 -9
View File
@@ -3532,12 +3532,6 @@ class TestEquityAutoClose(WithTmpDir, WithTradingCalendars, ZiplineTestCase):
sids = asset_info.index
env = self.enter_instance_context(tmp_trading_env(equities=asset_info))
market_opens = self.trading_calendar.schedule.market_open.loc[
self.test_days
]
market_closes = self.trading_calendar.schedule.market_close.loc[
self.test_days
]
if frequency == 'daily':
dates = self.test_days
@@ -3569,10 +3563,10 @@ class TestEquityAutoClose(WithTmpDir, WithTradingCalendars, ZiplineTestCase):
self.test_days[-1],
)
writer = BcolzMinuteBarWriter(
self.test_days[0],
self.tmpdir.path,
market_opens,
market_closes,
self.trading_calendar,
self.test_days[0],
self.test_days[-1],
US_EQUITIES_MINUTES_PER_DAY
)
trade_data_by_sid = make_trade_data_for_asset_info(
+3 -3
View File
@@ -381,12 +381,12 @@ def _make_bundle_core():
daily_bar_writer.write(())
minute_bar_writer = BcolzMinuteBarWriter(
bundle.start_session,
wd.ensure_dir(*minute_equity_relative(
name, timestr, environ=environ)
),
bundle.calendar.schedule['market_open'],
bundle.calendar.schedule['market_close'],
bundle.calendar,
bundle.start_session,
bundle.end_session,
minutes_per_day=bundle.minutes_per_day,
)
asset_db_writer = AssetDBWriter(
+128 -90
View File
@@ -32,6 +32,7 @@ from zipline.data._minute_bar_internal import (
)
from zipline.gens.sim_engine import NANOS_IN_MINUTE
from zipline.utils.calendars import get_calendar
from zipline.utils.cli import maybe_show_progress
from zipline.utils.memoize import lazyval
@@ -195,20 +196,19 @@ class BcolzMinuteBarMetadata(object):
"""
Parameters
----------
first_trading_day : datetime-like
UTC midnight of the first day available in the dataset.
minute_index : pd.DatetimeIndex
The minutes which act as an index into the corresponding values
written into each sid's ctable.
market_opens : pd.DatetimeIndex
The market opens for each day in the data set. (Not yet required.)
market_closes : pd.DatetimeIndex
The market closes for each day in the data set. (Not yet required.)
ohlc_ratio : int
The factor by which the pricing data is multiplied so that the
float data can be stored as an integer.
calendar : zipline.utils.calendars.trading_calendar.TradingCalendar
The TradingCalendar on which the minute bars are based.
start_session : datetime
The first trading session in the data set.
end_session : datetime
The last trading session in the data set.
minutes_per_day : int
The number of minutes per each period.
"""
FORMAT_VERSION = 1
FORMAT_VERSION = 2
METADATA_FILENAME = 'metadata.json'
@@ -229,41 +229,50 @@ class BcolzMinuteBarMetadata(object):
# if version does not match.
version = 0
first_trading_day = pd.Timestamp(
raw_data['first_trading_day'], tz='UTC')
market_opens = pd.to_datetime(raw_data['market_opens'],
unit='m',
utc=True)
market_closes = pd.to_datetime(raw_data['market_closes'],
unit='m',
utc=True)
ohlc_ratio = raw_data['ohlc_ratio']
if version == 0:
if version >= 1:
minutes_per_day = raw_data['minutes_per_day']
else:
# version 0 always assumed US equities.
minutes_per_day = US_EQUITIES_MINUTES_PER_DAY
if version >= 2:
calendar = get_calendar(raw_data['calendar_name'])
start_session = pd.Timestamp(
raw_data['start_session'], tz='UTC')
end_session = pd.Timestamp(raw_data['end_session'], tz='UTC')
else:
minutes_per_day = raw_data['minutes_per_day']
# No calendar info included in older versions, so
# default to NYSE.
calendar = get_calendar('NYSE')
start_session = pd.Timestamp(
raw_data['first_trading_day'], tz='UTC')
end_session = calendar.minute_to_session_label(
pd.Timestamp(
raw_data['market_closes'][-1], unit='m', tz='UTC')
)
return cls(
first_trading_day,
market_opens,
market_closes,
ohlc_ratio,
calendar,
start_session,
end_session,
minutes_per_day,
)
def __init__(
self,
first_trading_day,
market_opens,
market_closes,
ohlc_ratio,
calendar,
start_session,
end_session,
minutes_per_day,
):
self.first_trading_day = first_trading_day
self.market_opens = market_opens
self.market_closes = market_closes
self.calendar = calendar
self.start_session = start_session
self.end_session = end_session
self.ohlc_ratio = ohlc_ratio
self.minutes_per_day = minutes_per_day
@@ -272,27 +281,61 @@ class BcolzMinuteBarMetadata(object):
Write the metadata to a JSON file in the rootdir.
Values contained in the metadata are:
first_trading_day : string
'YYYY-MM-DD' formatted representation of the first trading day
available in the dataset.
minute_index : list of integers
nanosecond integer representation of the minutes, the enumeration
of which corresponds to the values in each bcolz carray.
version : int
The value of FORMAT_VERSION of this class.
ohlc_ratio : int
The factor by which the pricing data is multiplied so that the
float data can be stored as an integer.
minutes_per_day : int
The number of minutes per each period.
calendar_name : str
The name of the TradingCalendar on which the minute bars are
based.
start_session : datetime
'YYYY-MM-DD' formatted representation of the first trading
session in the data set.
end_session : datetime
'YYYY-MM-DD' formatted representation of the last trading
session in the data set.
Deprecated, but included for backwards compatibility:
first_trading_day : string
'YYYY-MM-DD' formatted representation of the first trading day
available in the dataset.
market_opens : list
List of int64 values representing UTC market opens as
minutes since epoch.
market_closes : list
List of int64 values representing UTC market closes as
minutes since epoch.
"""
calendar = self.calendar
slicer = calendar.schedule.index.slice_indexer(
self.start_session,
self.end_session,
)
schedule = calendar.schedule[slicer]
market_opens = schedule.market_open
market_closes = schedule.market_close
metadata = {
'version': self.FORMAT_VERSION,
'first_trading_day': str(self.first_trading_day.date()),
'market_opens': self.market_opens.values.
astype('datetime64[m]').
astype(np.int64).tolist(),
'market_closes': self.market_closes.values.
astype('datetime64[m]').
astype(np.int64).tolist(),
'ohlc_ratio': self.ohlc_ratio,
'minutes_per_day': self.minutes_per_day
'minutes_per_day': self.minutes_per_day,
'calendar_name': self.calendar.name,
'start_session': str(self.start_session.date()),
'end_session': str(self.end_session.date()),
# Write these values for backwards compatibility
'first_trading_day': str(self.start_session.date()),
'market_opens': (
market_opens.values.astype('datetime64[m]').
astype(np.int64).tolist()),
'market_closes': (
market_closes.values.astype('datetime64[m]').
astype(np.int64).tolist()),
}
with open(self.metadata_path(rootdir), 'w+') as fp:
json.dump(metadata, fp)
@@ -304,35 +347,21 @@ class BcolzMinuteBarWriter(object):
Parameters
----------
first_trading_day : datetime
The first trading day in the data set.
rootdir : string
Path to the root directory into which to write the metadata and
bcolz subdirectories.
market_opens : pd.Series
The market opens used as a starting point for each periodic span of
minutes in the index.
The index of the series is expected to be a DatetimeIndex of the
UTC midnight of each trading day.
The values are datetime64-like UTC market opens for each day in the
index.
market_closes : pd.Series
The market closes that correspond with the market opens,
The index of the series is expected to be a DatetimeIndex of the
UTC midnight of each trading day.
The values are datetime64-like UTC market opens for each day in the
index.
The closes are written so that the reader can filter out non-market
minutes even though the tail end of early closes are written in
the data arrays to keep a regular shape.
calendar : zipline.utils.calendars.trading_calendar.TradingCalendar
The trading calendar on which to base the minute bars. Used to
get the market opens used as a starting point for each periodic
span of minutes in the index, and the market closes that
correspond with the market opens.
minutes_per_day : int
The number of minutes per each period. Defaults to 390, the mode
of minutes in NYSE trading days.
start_session : datetime
The first trading session in the data set.
end_session : datetime
The last trading session in the data set.
ohlc_ratio : int, optional
The ratio by which to multiply the pricing data to convert the
floats from floats to an integer to fit within the np.uint32.
@@ -397,40 +426,41 @@ class BcolzMinuteBarWriter(object):
COL_NAMES = ('open', 'high', 'low', 'close', 'volume')
def __init__(self,
first_trading_day,
rootdir,
market_opens,
market_closes,
calendar,
start_session,
end_session,
minutes_per_day,
ohlc_ratio=OHLC_RATIO,
expectedlen=DEFAULT_EXPECTEDLEN):
self._rootdir = rootdir
self._first_trading_day = first_trading_day
self._market_opens = market_opens[
market_opens.index.slice_indexer(start=self._first_trading_day)]
self._market_closes = market_closes[
market_closes.index.slice_indexer(start=self._first_trading_day)]
self._trading_days = self._market_opens.index
self._start_session = start_session
self._end_session = end_session
self._calendar = calendar
slicer = (
calendar.schedule.index.slice_indexer(start_session, end_session))
self._schedule = calendar.schedule[slicer]
self._session_labels = self._schedule.index
self._minutes_per_day = minutes_per_day
self._expectedlen = expectedlen
self._ohlc_ratio = ohlc_ratio
self._minute_index = _calc_minute_index(
self._market_opens, self._minutes_per_day)
self._schedule.market_open, self._minutes_per_day)
metadata = BcolzMinuteBarMetadata(
self._first_trading_day,
self._market_opens,
self._market_closes,
self._ohlc_ratio,
self._calendar,
self._start_session,
self._end_session,
self._minutes_per_day,
)
metadata.write(self._rootdir)
@property
def first_trading_day(self):
return self._first_trading_day
return self._start_session
def sidpath(self, sid):
"""
@@ -470,7 +500,7 @@ class BcolzMinuteBarWriter(object):
if num_days == 0:
# empty container
return pd.NaT
return self._trading_days[num_days - 1]
return self._session_labels[num_days - 1]
def _init_ctable(self, path):
"""
@@ -552,7 +582,7 @@ class BcolzMinuteBarWriter(object):
last_date = self.last_date_in_output_for_sid(sid)
tds = self._trading_days
tds = self._session_labels
if date <= last_date or date < tds[0]:
# No need to pad.
@@ -695,7 +725,7 @@ class BcolzMinuteBarWriter(object):
"""
table = self._ensure_ctable(sid)
tds = self._trading_days
tds = self._session_labels
input_first_day = pd.Timestamp(dts[0].astype('datetime64[D]'),
tz='UTC')
@@ -783,13 +813,20 @@ class BcolzMinuteBarReader(MinuteBarReader):
metadata = self._get_metadata()
self._first_trading_day = metadata.first_trading_day
self._start_session = metadata.start_session
self._end_session = metadata.end_session
self._market_opens = metadata.market_opens
self._market_open_values = metadata.market_opens.values.\
self.calendar = metadata.calendar
slicer = self.calendar.schedule.index.slice_indexer(
self._start_session,
self._end_session,
)
self._schedule = self.calendar.schedule[slicer]
self._market_opens = self._schedule.market_open
self._market_open_values = self._market_opens.values.\
astype('datetime64[m]').astype(np.int64)
self._market_closes = metadata.market_closes
self._market_close_values = metadata.market_closes.values.\
self._market_closes = self._schedule.market_close
self._market_close_values = self._market_closes.values.\
astype('datetime64[m]').astype(np.int64)
self._ohlc_inverse = 1.0 / metadata.ohlc_ratio
@@ -809,11 +846,12 @@ class BcolzMinuteBarReader(MinuteBarReader):
@lazyval
def last_available_dt(self):
return self._market_closes[-1]
_, close = self.calendar.open_and_close_for_session(self._end_session)
return close
@property
def first_trading_day(self):
return self._first_trading_day
return self._start_session
def _minutes_to_exclude(self):
"""
+3 -6
View File
@@ -507,14 +507,11 @@ def create_data_portal(asset_finder, tempdir, sim_params, sids,
def write_bcolz_minute_data(trading_calendar, days, path, data):
market_opens = trading_calendar.schedule.loc[days].market_open
market_closes = trading_calendar.schedule.loc[days].market_close
BcolzMinuteBarWriter(
days[0],
path,
market_opens,
market_closes,
trading_calendar,
days[0],
days[-1],
US_EQUITIES_MINUTES_PER_DAY
).write(data)
+6 -6
View File
@@ -995,10 +995,10 @@ class WithBcolzEquityMinuteBarReader(WithEquityMinuteBarData, WithTmpDir):
days = cls.equity_minute_bar_days
writer = BcolzMinuteBarWriter(
days[0],
p,
cls.trading_calendar.schedule.market_open.loc[days],
cls.trading_calendar.schedule.market_close.loc[days],
cls.trading_calendar,
days[0],
days[-1],
US_EQUITIES_MINUTES_PER_DAY
)
writer.write(cls.make_equity_minute_bar_data())
@@ -1055,10 +1055,10 @@ class WithBcolzFutureMinuteBarReader(WithFutureMinuteBarData, WithTmpDir):
days = cls.future_minute_bar_days
writer = BcolzMinuteBarWriter(
days[0],
p,
trading_calendar.schedule.market_open.loc[days],
trading_calendar.schedule.market_close.loc[days],
trading_calendar,
days[0],
days[-1],
FUTURES_MINUTES_PER_DAY,
)
writer.write(cls.make_future_minute_bar_data())