# Copyright 2016 Quantopian, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections import OrderedDict from abc import ABCMeta, abstractmethod import numpy as np from numpy import nan import pandas as pd from pandas import DataFrame from six import with_metaclass from zipline.data.minute_bars import MinuteBarReader from zipline.data.us_equity_pricing import NoDataOnDate from zipline.data.session_bars import SessionBarReader from zipline.utils.memoize import lazyval _MINUTE_TO_SESSION_OHCLV_HOW = OrderedDict(( ('open', 'first'), ('high', 'max'), ('low', 'min'), ('close', 'last'), ('volume', 'sum'), )) def minute_to_session(minute_frame, calendar): """ Resample a DataFrame with minute data into the frame expected by a BcolzDailyBarWriter. Parameters ---------- minute_frame : pd.DataFrame A DataFrame with the columns `open`, `high`, `low`, `close`, `volume`, and `dt` (minute dts) calendar : zipline.utils.calendars.trading_calendar.TradingCalendar A TradingCalendar on which session labels to resample from minute to session. Return ------ session_frame : pd.DataFrame A DataFrame with the columns `open`, `high`, `low`, `close`, `volume`, and `day` (datetime-like). """ how = OrderedDict((c, _MINUTE_TO_SESSION_OHCLV_HOW[c]) for c in minute_frame.columns) return minute_frame.groupby(calendar.minute_to_session_label).agg( how) class DailyHistoryAggregator(object): """ Converts minute pricing data into a daily summary, to be used for the last slot in a call to history with a frequency of `1d`. This summary is the same as a daily bar rollup of minute data, with the distinction that the summary is truncated to the `dt` requested. i.e. the aggregation slides forward during a the course of simulation day. Provides aggregation for `open`, `high`, `low`, `close`, and `volume`. The aggregation rules for each price type is documented in their respective """ def __init__(self, market_opens, minute_reader, trading_calendar): self._market_opens = market_opens self._minute_reader = minute_reader self._trading_calendar = trading_calendar # The caches are structured as (date, market_open, entries), where # entries is a dict of asset -> (last_visited_dt, value) # # Whenever an aggregation method determines the current value, # the entry for the respective asset should be overwritten with a new # entry for the current dt.value (int) and aggregation value. # # When the requested dt's date is different from date the cache is # flushed, so that the cache entries do not grow unbounded. # # Example cache: # cache = (date(2016, 3, 17), # pd.Timestamp('2016-03-17 13:31', tz='UTC'), # { # 1: (1458221460000000000, np.nan), # 2: (1458221460000000000, 42.0), # }) self._caches = { 'open': None, 'high': None, 'low': None, 'close': None, 'volume': None } # The int value is used for deltas to avoid extra computation from # creating new Timestamps. self._one_min = pd.Timedelta('1 min').value def _prelude(self, dt, field): session = self._trading_calendar.minute_to_session_label(dt) dt_value = dt.value cache = self._caches[field] if cache is None or cache[0] != session: market_open = self._market_opens.loc[session] cache = self._caches[field] = (session, market_open, {}) _, market_open, entries = cache market_open = market_open.tz_localize('UTC') if dt != market_open: prev_dt = dt_value - self._one_min else: prev_dt = None return market_open, prev_dt, dt_value, entries def opens(self, assets, dt): """ The open field's aggregation returns the first value that occurs for the day, if there has been no data on or before the `dt` the open is `nan`. Once the first non-nan open is seen, that value remains constant per asset for the remainder of the day. Returns ------- np.array with dtype=float64, in order of assets parameter. """ market_open, prev_dt, dt_value, entries = self._prelude(dt, 'open') opens = [] session_label = self._trading_calendar.minute_to_session_label(dt) for asset in assets: if not asset.is_alive_for_session(session_label): opens.append(np.NaN) continue if prev_dt is None: val = self._minute_reader.get_value(asset, dt, 'open') entries[asset] = (dt_value, val) opens.append(val) continue else: try: last_visited_dt, first_open = entries[asset] if last_visited_dt == dt_value: opens.append(first_open) continue elif not pd.isnull(first_open): opens.append(first_open) entries[asset] = (dt_value, first_open) continue else: after_last = pd.Timestamp( last_visited_dt + self._one_min, tz='UTC') window = self._minute_reader.load_raw_arrays( ['open'], after_last, dt, [asset], )[0] nonnan = window[~pd.isnull(window)] if len(nonnan): val = nonnan[0] else: val = np.nan entries[asset] = (dt_value, val) opens.append(val) continue except KeyError: window = self._minute_reader.load_raw_arrays( ['open'], market_open, dt, [asset], )[0] nonnan = window[~pd.isnull(window)] if len(nonnan): val = nonnan[0] else: val = np.nan entries[asset] = (dt_value, val) opens.append(val) continue return np.array(opens) def highs(self, assets, dt): """ The high field's aggregation returns the largest high seen between the market open and the current dt. If there has been no data on or before the `dt` the high is `nan`. Returns ------- np.array with dtype=float64, in order of assets parameter. """ market_open, prev_dt, dt_value, entries = self._prelude(dt, 'high') highs = [] session_label = self._trading_calendar.minute_to_session_label(dt) for asset in assets: if not asset.is_alive_for_session(session_label): highs.append(np.NaN) continue if prev_dt is None: val = self._minute_reader.get_value(asset, dt, 'high') entries[asset] = (dt_value, val) highs.append(val) continue else: try: last_visited_dt, last_max = entries[asset] if last_visited_dt == dt_value: highs.append(last_max) continue elif last_visited_dt == prev_dt: curr_val = self._minute_reader.get_value( asset, dt, 'high') if pd.isnull(curr_val): val = last_max elif pd.isnull(last_max): val = curr_val else: val = max(last_max, curr_val) entries[asset] = (dt_value, val) highs.append(val) continue else: after_last = pd.Timestamp( last_visited_dt + self._one_min, tz='UTC') window = self._minute_reader.load_raw_arrays( ['high'], after_last, dt, [asset], )[0].T val = np.nanmax(np.append(window, last_max)) entries[asset] = (dt_value, val) highs.append(val) continue except KeyError: window = self._minute_reader.load_raw_arrays( ['high'], market_open, dt, [asset], )[0].T val = np.nanmax(window) entries[asset] = (dt_value, val) highs.append(val) continue return np.array(highs) def lows(self, assets, dt): """ The low field's aggregation returns the smallest low seen between the market open and the current dt. If there has been no data on or before the `dt` the low is `nan`. Returns ------- np.array with dtype=float64, in order of assets parameter. """ market_open, prev_dt, dt_value, entries = self._prelude(dt, 'low') lows = [] session_label = self._trading_calendar.minute_to_session_label(dt) for asset in assets: if not asset.is_alive_for_session(session_label): lows.append(np.NaN) continue if prev_dt is None: val = self._minute_reader.get_value(asset, dt, 'low') entries[asset] = (dt_value, val) lows.append(val) continue else: try: last_visited_dt, last_min = entries[asset] if last_visited_dt == dt_value: lows.append(last_min) continue elif last_visited_dt == prev_dt: curr_val = self._minute_reader.get_value( asset, dt, 'low') val = np.nanmin([last_min, curr_val]) entries[asset] = (dt_value, val) lows.append(val) continue else: after_last = pd.Timestamp( last_visited_dt + self._one_min, tz='UTC') window = self._minute_reader.load_raw_arrays( ['low'], after_last, dt, [asset], )[0].T val = np.nanmin(np.append(window, last_min)) entries[asset] = (dt_value, val) lows.append(val) continue except KeyError: window = self._minute_reader.load_raw_arrays( ['low'], market_open, dt, [asset], )[0].T val = np.nanmin(window) entries[asset] = (dt_value, val) lows.append(val) continue return np.array(lows) def closes(self, assets, dt): """ The close field's aggregation returns the latest close at the given dt. If the close for the given dt is `nan`, the most recent non-nan `close` is used. If there has been no data on or before the `dt` the close is `nan`. Returns ------- np.array with dtype=float64, in order of assets parameter. """ market_open, prev_dt, dt_value, entries = self._prelude(dt, 'close') closes = [] session_label = self._trading_calendar.minute_to_session_label(dt) for asset in assets: if not asset.is_alive_for_session(session_label): closes.append(np.NaN) continue if prev_dt is None: val = self._minute_reader.get_value(asset, dt, 'close') entries[asset] = (dt_value, val) closes.append(val) continue else: try: last_visited_dt, last_close = entries[asset] if last_visited_dt == dt_value: closes.append(last_close) continue elif last_visited_dt == prev_dt: val = self._minute_reader.get_value( asset, dt, 'close') if pd.isnull(val): val = last_close entries[asset] = (dt_value, val) closes.append(val) continue else: val = self._minute_reader.get_value( asset, dt, 'close') if pd.isnull(val): val = self.closes( [asset], pd.Timestamp(prev_dt, tz='UTC'))[0] entries[asset] = (dt_value, val) closes.append(val) continue except KeyError: val = self._minute_reader.get_value( asset, dt, 'close') if pd.isnull(val): val = self.closes([asset], pd.Timestamp(prev_dt, tz='UTC'))[0] entries[asset] = (dt_value, val) closes.append(val) continue return np.array(closes) def volumes(self, assets, dt): """ The volume field's aggregation returns the sum of all volumes between the market open and the `dt` If there has been no data on or before the `dt` the volume is 0. Returns ------- np.array with dtype=int64, in order of assets parameter. """ market_open, prev_dt, dt_value, entries = self._prelude(dt, 'volume') volumes = [] session_label = self._trading_calendar.minute_to_session_label(dt) for asset in assets: if not asset.is_alive_for_session(session_label): volumes.append(0) continue if prev_dt is None: val = self._minute_reader.get_value(asset, dt, 'volume') entries[asset] = (dt_value, val) volumes.append(val) continue else: try: last_visited_dt, last_total = entries[asset] if last_visited_dt == dt_value: volumes.append(last_total) continue elif last_visited_dt == prev_dt: val = self._minute_reader.get_value( asset, dt, 'volume') val += last_total entries[asset] = (dt_value, val) volumes.append(val) continue else: after_last = pd.Timestamp( last_visited_dt + self._one_min, tz='UTC') window = self._minute_reader.load_raw_arrays( ['volume'], after_last, dt, [asset], )[0] val = np.nansum(window) + last_total entries[asset] = (dt_value, val) volumes.append(val) continue except KeyError: window = self._minute_reader.load_raw_arrays( ['volume'], market_open, dt, [asset], )[0] val = np.nansum(window) entries[asset] = (dt_value, val) volumes.append(val) continue return np.array(volumes) class MinuteResampleSessionBarReader(SessionBarReader): def __init__(self, calendar, minute_bar_reader): self._calendar = calendar self._minute_bar_reader = minute_bar_reader def _get_resampled(self, columns, start_dt, end_dt, assets): minute_data = self._minute_bar_reader.load_raw_arrays( columns, start_dt, end_dt, assets) dts = self._calendar.minutes_in_range(start_dt, end_dt) frames = [] for i, _ in enumerate(assets): minute_frame = DataFrame((d.T[i] for d in minute_data), index=columns, columns=dts).T df = minute_to_session(minute_frame, self._calendar) frames.append(df) return frames @property def trading_calendar(self): return self._calendar def load_raw_arrays(self, columns, start_dt, end_dt, sids): sessions = self._calendar.sessions_in_range(start_dt, end_dt) range_open, _ = self._calendar.open_and_close_for_session( start_dt) _, range_close = self._calendar.open_and_close_for_session( end_dt) shape = len(sessions), len(sids) results = [] for col in columns: if col != 'volume': out = np.full(shape, np.nan) else: out = np.zeros(shape, dtype=np.uint32) results.append(out) frames = self._get_resampled(columns, range_open, range_close, sids) for i, result in enumerate(results): for j, frame in enumerate(frames): result[:, j] = frame.values[:, i] return results def get_value(self, sid, session, colname): # WARNING: This will need caching or other optimization if used in a # tight loop. # This was developed to complete interface, but has not been tuned # for real world use. start, end = self._calendar.open_and_close_for_session(session) frame = self._get_resampled([colname], start, end, [sid])[0] return frame.loc[session, colname] @lazyval def sessions(self): cal = self._calendar first = self._minute_bar_reader.first_trading_day last = cal.minute_to_session_label( self._minute_bar_reader.last_available_dt) return cal.sessions_in_range(first, last) @lazyval def last_available_dt(self): return self.trading_calendar.minute_to_session_label( self._minute_bar_reader.last_available_dt ) @property def first_trading_day(self): return self._minute_bar_reader.first_trading_day class ReindexBarReader(with_metaclass(ABCMeta)): """ A base class for readers which reindexes results, filling in the additional indices with empty data. Used to align the reading assets which trade on different calendars. Currently only supports a ``trading_calendar`` which is a superset of the ``reader``'s calendar. Also, the currenty implementation only reindexes the results from ``load_raw_arrays``, but in the future, `get_value` may also be made to provide an empty result instead of raising on error. Parameters ---------- - trading_calendar : zipline.utils.trading_calendar.TradingCalendar The calendar to use when indexing results from the reader. - reader : MinuteBarReader|SessionBarReader The reader which has a calendar that is a subset of the desired ``trading_calendar``. - first_trading_session : pd.Timestamp The first trading session the reader should provide. Must be specified, since the ``reader``'s first session may not exactly align with the desired calendar. Specifically, in the case where the first session on the target calendar is a holiday on the ``reader``'s calendar. - last_trading_session : pd.Timestamp The last trading session the reader should provide. Must be specified, since the ``reader``'s last session may not exactly align with the desired calendar. Specifically, in the case where the last session on the target calendar is a holiday on the ``reader``'s calendar. """ def __init__(self, trading_calendar, reader, first_trading_session, last_trading_session): self._trading_calendar = trading_calendar self._reader = reader self._first_trading_session = first_trading_session self._last_trading_session = last_trading_session @property def last_available_dt(self): return self._reader.last_available_dt def get_last_traded_dt(self, sid, dt): return self._reader.get_last_traded_dt(sid, dt) @property def first_trading_day(self): return self._reader.first_trading_day def get_value(self, sid, dt, field): try: return self._reader.get_value(sid, dt, field) except NoDataOnDate: if field == 'volume': return 0 else: return nan @abstractmethod def _outer_dts(self, start_dt, end_dt): raise NotImplementedError @abstractmethod def _inner_dts(self, start_dt, end_dt): raise NotImplementedError @property def trading_calendar(self): return self._trading_calendar @lazyval def sessions(self): return self.trading_calendar.sessions_in_range( self._first_trading_session, self._last_trading_session ) def load_raw_arrays(self, fields, start_dt, end_dt, sids): outer_dts = self._outer_dts(start_dt, end_dt) inner_dts = self._inner_dts(start_dt, end_dt) indices = outer_dts.searchsorted(inner_dts) shape = len(outer_dts), len(sids) outer_results = [] if len(inner_dts) > 0: inner_results = self._reader.load_raw_arrays( fields, inner_dts[0], inner_dts[-1], sids) else: inner_results = None for i, field in enumerate(fields): if field != 'volume': out = np.full(shape, np.nan) else: out = np.zeros(shape, dtype=np.uint32) if inner_results is not None: out[indices] = inner_results[i] outer_results.append(out) return outer_results class ReindexMinuteBarReader(ReindexBarReader, MinuteBarReader): """ See: ``ReindexBarReader`` """ def _outer_dts(self, start_dt, end_dt): return self._trading_calendar.minutes_in_range(start_dt, end_dt) def _inner_dts(self, start_dt, end_dt): return self._reader.calendar.minutes_in_range(start_dt, end_dt) class ReindexSessionBarReader(ReindexBarReader, SessionBarReader): """ See: ``ReindexBarReader`` """ def _outer_dts(self, start_dt, end_dt): return self.trading_calendar.sessions_in_range(start_dt, end_dt) def _inner_dts(self, start_dt, end_dt): return self._reader.trading_calendar.sessions_in_range( start_dt, end_dt)