Files
catalyst/zipline/modelling/engine.py
T
Scott Sanderson 26fd6fda8b ENH/BUG: Modeling API enhancements.
- Fixes an error where Modeling API data known as of the close of `day
  N` would be shown to algorithms during `before_trading_start` as of
  the close of the same day.  Algorithms should now only receive data
  during `before_trading_start/handle_data` that was known as of the
  simulation time at which the function would be called.

- All Term instances now have a `mask` attribute that must be a `Filter`
  or an instance of `AssetExists()`.  `mask` can be used to specify that
  a Factor should be computed in a manner that ignores the values that
  were not `True` in the mask.

- Changed the interface for `FFCLoader.load_adjusted_array` and
  `Term._compute` from `(columns, mask)`, with mask as a DataFrame, to
  `(columns, dates, assets, mask)`, where mask is a numpy array.  This
  is primarily to avoid having to reconstruct extra DataFrames when
  using masks produced by non `AssetExists` filters.

- Adds `BoundColumn.latest`, which gives the most-recently-known value
  of a column.
2015-09-16 01:47:11 -04:00

484 lines
17 KiB
Python

"""
Compute Engine for FFC API
"""
from abc import (
ABCMeta,
abstractmethod,
)
from operator import and_
from six import (
iteritems,
itervalues,
with_metaclass,
)
from six.moves import (
reduce,
zip_longest,
)
from numpy import (
add,
empty_like,
)
from pandas import (
DataFrame,
date_range,
MultiIndex,
)
from zipline.lib.adjusted_array import ensure_ndarray
from zipline.errors import NoFurtherDataError
from zipline.utils.pandas_utils import explode
from .classifier import Classifier
from .factor import Factor
from .filter import Filter
from .graph import TermGraph
from .term import AssetExists
class FFCEngine(with_metaclass(ABCMeta)):
@abstractmethod
def factor_matrix(self, terms, start_date, end_date):
"""
Compute values for `terms` between `start_date` and `end_date`.
Returns a DataFrame with a MultiIndex of (date, asset) pairs on the
index. On each date, we return a row for each asset that passed all
instances of `Filter` in `terms, and the columns of the returned frame
will be the keys in `terms` whose values are instances of `Factor`.
Parameters
----------
terms : dict[str -> zipline.modelling.term.Term]
Dict mapping term names to instances. The supplied names are used
as column names in our output frame.
start_date : pd.Timestamp
Start date of the computed matrix.
end_date : pd.Timestamp
End date of the computed matrix.
Returns
-------
matrix : pd.DataFrame
A matrix of computed results.
"""
raise NotImplementedError("factor_matrix")
class NoOpFFCEngine(FFCEngine):
"""
FFCEngine that doesn't do anything.
"""
def factor_matrix(self, terms, start_date, end_date):
return DataFrame(
index=MultiIndex.from_product(
[date_range(start=start_date, end=end_date, freq='D'), ()],
),
columns=sorted(terms.keys())
)
class SimpleFFCEngine(object):
"""
FFC Engine class that computes each term independently.
Parameters
----------
loader : FFCLoader
A loader to use to retrieve raw data for atomic terms.
calendar : DatetimeIndex
Array of dates to consider as trading days when computing a range
between a fixed start and end.
asset_finder : zipline.assets.AssetFinder
An AssetFinder instance. We depend on the AssetFinder to determine
which assets are in the top-level universe at any point in time.
"""
__slots__ = [
'_loader',
'_calendar',
'_finder',
'_root_mask_term',
'__weakref__',
]
def __init__(self, loader, calendar, asset_finder):
self._loader = loader
self._calendar = calendar
self._finder = asset_finder
self._root_mask_term = AssetExists()
def factor_matrix(self, terms, start_date, end_date):
"""
Compute a factor matrix.
Parameters
----------
terms : dict[str -> zipline.modelling.term.Term]
Dict mapping term names to instances. The supplied names are used
as column names in our output frame.
start_date : pd.Timestamp
Start date of the computed matrix.
end_date : pd.Timestamp
End date of the computed matrix.
The algorithm implemented here can be broken down into the following
stages:
0. Build a dependency graph of all terms in `terms`. Topologically
sort the graph to determine an order in which we can compute the terms.
1. Ask our AssetFinder for a "lifetimes matrix", which should contain,
for each date between start_date and end_date, a boolean value for each
known asset indicating whether the asset existed on that date.
2. Compute each term in the dependency order determined in (0), caching
the results in a a dictionary to that they can be fed into future
terms.
3. For each date, determine the number of assets passing **all**
filters. The sum, N, of all these values is the total number of rows in
our output frame, so we pre-allocate an output array of length N for
each factor in `terms`.
4. Fill in the arrays allocated in (3) by copying computed values from
our output cache into the corresponding rows.
5. Stick the values computed in (4) into a DataFrame and return it.
Step 0 is performed by `zipline.modelling.graph.TermGraph`.
Step 1 is performed in `self._compute_root_mask`.
Step 2 is performed in `self.compute_chunk`.
Steps 3, 4, and 5 are performed in self._format_factor_matrix.
See Also
--------
FFCEngine.factor_matrix
"""
if end_date <= start_date:
raise ValueError(
"start_date must be before end_date \n"
"start_date=%s, end_date=%s" % (start_date, end_date)
)
graph = TermGraph(terms)
extra_rows = graph.extra_rows[self._root_mask_term]
root_mask = self._compute_root_mask(start_date, end_date, extra_rows)
dates, assets, root_mask_values = explode(root_mask)
raw_outputs = self.compute_chunk(
graph,
dates,
assets,
initial_workspace={self._root_mask_term: root_mask_values},
)
# Collect the results that we'll actually show to the user.
filters, factors = {}, {}
for name, term in iteritems(terms):
if isinstance(term, Filter):
filters[name] = raw_outputs[name]
elif isinstance(term, Factor):
factors[name] = raw_outputs[name]
elif isinstance(term, Classifier):
continue
else:
raise ValueError("Unknown term type: %s" % term)
# Add the root mask as an implicit filter, truncating off the extra
# rows that we only needed to compute other terms.
filters['base'] = root_mask_values[extra_rows:]
out_dates = dates[extra_rows:]
return self._format_factor_matrix(out_dates, assets, filters, factors)
def _compute_root_mask(self, start_date, end_date, extra_rows):
"""
Compute a lifetimes matrix from our AssetFinder, then drop columns that
didn't exist at all during the query dates.
Parameters
----------
start_date : pd.Timestamp
Base start date for the matrix.
end_date : pd.Timestamp
End date for the matrix.
extra_rows : int
Number of extra rows to compute before `start_date`.
Extra rows are needed by terms like moving averages that require a
trailing window of data.
Returns
-------
lifetimes : pd.DataFrame
Frame of dtype `bool` containing dates from `extra_rows` days
before `start_date`, continuing through to `end_date`. The
returned frame contains as columns all assets in our AssetFinder
that existed for at least one day between `start_date` and
`end_date`.
"""
calendar = self._calendar
finder = self._finder
start_idx, end_idx = self._calendar.slice_locs(start_date, end_date)
if start_idx < extra_rows:
raise NoFurtherDataError(
msg="Insufficient data to compute FFC Matrix: "
"start date was %s, "
"earliest known date was %s, "
"and %d extra rows were requested." % (
start_date, calendar[0], extra_rows,
),
)
# Build lifetimes matrix reaching back to `extra_rows` days before
# `start_date.`
lifetimes = finder.lifetimes(
calendar[start_idx - extra_rows:end_idx],
include_start_date=False
)
assert lifetimes.index[extra_rows] == start_date
assert lifetimes.index[-1] == end_date
if not lifetimes.columns.unique:
columns = lifetimes.columns
duplicated = columns[columns.duplicated()].unique()
raise AssertionError("Duplicated sids: %d" % duplicated)
# Filter out columns that didn't exist between the requested start and
# end dates.
existed = lifetimes.iloc[extra_rows:].any()
return lifetimes.loc[:, existed]
def _mask_and_dates_for_term(self, term, workspace, graph, dates):
"""
Load mask and mask row labels for term.
"""
mask = term.mask
offset = graph.extra_rows[mask] - graph.extra_rows[term]
return workspace[mask][offset:], dates[offset:]
def _inputs_for_term(self, term, workspace, graph):
"""
Compute inputs for the given term.
This is mostly complicated by the fact that for each input we store as
many rows as will be necessary to serve **any** computation requiring
that input.
"""
offsets = graph.offset
if term.windowed:
# If term is windowed, then all input data should be instances of
# AdjustedArray.
return [
workspace[input_].traverse(
window_length=term.window_length,
offset=offsets[term, input_]
)
for input_ in term.inputs
]
# If term is not windowed, input_data may be an AdjustedArray or
# np.ndarray. Coerce the former to the latter.
out = []
for input_ in term.inputs:
input_data = ensure_ndarray(workspace[input_])
offset = offsets[term, input_]
# OPTIMIZATION: Don't make a copy by doing input_data[0:] if
# offset is zero.
if offset:
input_data = input_data[offset:]
out.append(input_data)
return out
def compute_chunk(self, graph, dates, assets, initial_workspace):
"""
Compute the FFC terms in the graph for the requested start and end
dates.
Parameters
----------
graph : zipline.modelling.graph.TermGraph
dates : pd.DatetimeIndex
Row labels for our root mask.
assets : pd.Int64Index
Column labels for our root mask.
initial_workspace : dict
Map from term -> output.
Must contain at least entry for `self._root_mask_term` whose shape
is `(len(dates), len(assets))`, but may contain additional
pre-computed terms for testing or optimization purposes.
Returns
-------
results : dict
Dictionary mapping requested results to outputs.
"""
self._validate_compute_chunk_params(dates, assets, initial_workspace)
loader = self._loader
# Copy the supplied initial workspace so we don't mutate it in place.
workspace = initial_workspace.copy()
for term in graph.ordered():
# `term` may have been supplied in `initial_workspace`, and in the
# future we may pre-compute atomic terms coming from the same
# dataset. In either case, we will already have an entry for this
# term, which we shouldn't re-compute.
if term in workspace:
continue
# Asset labels are always the same, but date labels vary by how
# many extra rows are needed.
mask, mask_dates = self._mask_and_dates_for_term(
term, workspace, graph, dates
)
if term.atomic:
# FUTURE OPTIMIZATION: Scan the resolution order for terms in
# the same dataset and load them here as well.
to_load = [term]
loaded = loader.load_adjusted_array(
to_load, mask_dates, assets, mask,
)
assert len(to_load) == len(loaded)
for loaded_term, adj_array in zip_longest(to_load, loaded):
workspace[loaded_term] = adj_array
else:
workspace[term] = term._compute(
self._inputs_for_term(term, workspace, graph),
mask_dates,
assets,
mask,
)
assert(workspace[term].shape == mask.shape)
out = {}
graph_extra_rows = graph.extra_rows
for name, term in iteritems(graph.outputs):
# Truncate off extra rows from outputs.
out[name] = workspace[term][graph_extra_rows[term]:]
return out
def _format_factor_matrix(self, dates, assets, filters, factors):
"""
Convert raw computed filters/factors into a DataFrame for public APIs.
Parameters
----------
dates : np.array[datetime64]
Row index for arrays in `filters` and `factors.`
assets : np.array[int64]
Column index for arrays in `filters` and `factors.`
filters : dict
Dict mapping filter names -> computed filters.
factors : dict
Dict mapping factor names -> computed factors.
Returns
-------
factor_matrix : pd.DataFrame
The indices of `factor_matrix` are as follows:
index : two-tiered MultiIndex of (date, asset).
For each date, we return a row for each asset that passed all
filters on that date.
columns : keys from `factor_data`
Each date/asset/factor triple contains the computed value of the given
factor on the given date for the given asset.
"""
# FUTURE OPTIMIZATION: Cythonize all of this.
# Boolean mask of values that passed all filters.
unioned = reduce(and_, itervalues(filters))
# Parallel arrays of (x,y) coords for (date, asset) pairs that passed
# all filters. Each entry here will correspond to a row in our output
# frame.
nonzero_xs, nonzero_ys = unioned.nonzero()
# Raw arrays storing (date, asset) pairs.
# These will form the index of our output frame.
raw_dates_index = empty_like(nonzero_xs, dtype='datetime64[ns]')
raw_assets_index = empty_like(nonzero_xs, dtype=int)
# Mapping from column_name -> array.
# This will be the `data` arg to our output frame.
columns = {
name: empty_like(nonzero_xs, dtype=factor.dtype)
for name, factor in iteritems(factors)
}
# We're going to iterate over `iteritems(columns)` a whole bunch of
# times down below. It's faster to construct iterate over a tuple of
# pairs.
columns_iter = tuple(iteritems(columns))
# This is tricky.
# unioned.sum(axis=1) gives us an array of the same size as `dates`
# containing, for each date, the number of assets that passed our
# filters on that date.
# Running this through add.accumulate gives us an array containing, for
# each date, the running total of the number of assets that passed our
# filters on or before that date.
# This means that (bounds[i - 1], bounds[i]) gives us the indices of
# the first and last rows in our output frame for each date in `dates`.
bounds = add.accumulate(unioned.sum(axis=1))
day_start = 0
for day_idx, day_end in enumerate(bounds):
day_bounds = slice(day_start, day_end)
column_indices = nonzero_ys[day_bounds]
raw_dates_index[day_bounds] = dates[day_idx]
raw_assets_index[day_bounds] = assets[column_indices]
for name, colarray in columns_iter:
colarray[day_bounds] = factors[name][day_idx, column_indices]
# Upper bound of current row becomes lower bound for next row.
day_start = day_end
return DataFrame(
data=columns,
index=MultiIndex.from_arrays(
[
raw_dates_index,
# FUTURE OPTIMIZATION:
# Avoid duplicate lookups by grouping and only looking up
# each unique sid once.
self._finder.retrieve_all(raw_assets_index),
],
)
).tz_localize('UTC', level=0)
def _validate_compute_chunk_params(self, dates, assets, initial_workspace):
"""
Verify that the values passed to compute_chunk are well-formed.
"""
root = self._root_mask_term
clsname = type(self).__name__
# Writing this out explicitly so this errors in testing if we change
# the name without updating this line.
compute_chunk_name = self.compute_chunk.__name__
if root not in initial_workspace:
raise AssertionError(
"root_mask values not supplied to {cls}.{method}".format(
cls=clsname,
method=compute_chunk_name,
)
)
shape = initial_workspace[root].shape
implied_shape = len(dates), len(assets)
if shape != implied_shape:
raise AssertionError(
"root_mask shape is {shape}, but received dates/assets "
"imply that shape should be {implied}".format(
shape=shape,
implied=implied_shape,
)
)