mirror of
https://github.com/wassname/catalyst.git
synced 2026-07-02 03:31:52 +08:00
484 lines
16 KiB
Python
484 lines
16 KiB
Python
"""
|
|
Compute Engine for FFC API
|
|
"""
|
|
from abc import (
|
|
ABCMeta,
|
|
abstractmethod,
|
|
)
|
|
from operator import and_
|
|
from six import (
|
|
iteritems,
|
|
with_metaclass,
|
|
)
|
|
from six.moves import (
|
|
reduce,
|
|
zip,
|
|
zip_longest,
|
|
)
|
|
|
|
from networkx import (
|
|
DiGraph,
|
|
get_node_attributes,
|
|
topological_sort,
|
|
)
|
|
from numpy import (
|
|
add,
|
|
empty_like,
|
|
)
|
|
from pandas import (
|
|
DataFrame,
|
|
date_range,
|
|
MultiIndex,
|
|
)
|
|
|
|
from zipline.lib.adjusted_array import ensure_ndarray
|
|
from zipline.errors import NoFurtherDataError
|
|
from zipline.modelling.factor import Factor
|
|
from zipline.modelling.filter import Filter
|
|
|
|
|
|
# TODO: Move this somewhere else.
|
|
class CyclicDependency(Exception):
    """
    Raised by `_add_to_graph` when a term is encountered again while it is
    still being traversed, i.e. the term (transitively) depends on itself.
    """
|
|
|
|
|
|
def build_dependency_graph(terms):
    """
    Construct the dependency graph for `terms` and everything they need.

    Parameters
    ----------
    terms : iterable
        An iterable of zipline.modelling.term.Term.

    Returns
    -------
    dependencies : networkx.DiGraph
        A directed graph whose edges point from each input to the term that
        consumes it.

        Every node carries an `extra_rows` attribute: the number of
        additional leading rows that must be computed for that node.
        Extra rows are typically required when a term feeds a rolling
        window computation.  For example, a 30 day moving average of price
        from day X to day Y needs price data for the range from
        day (X - 29) to day Y.
    """
    graph = DiGraph()
    # Terms on the current traversal path; shared across all top-level terms
    # so that `_add_to_graph` can detect cycles.
    in_progress = set()
    for root in terms:
        _add_to_graph(root, graph, in_progress, extra_rows=0)

    # Each traversal removes what it added, so nothing may be left over once
    # every top-level term has been processed.
    assert not in_progress
    return graph
|
|
|
|
|
|
def _add_to_graph(term,
                  dependencies,
                  parents,
                  extra_rows):
    """
    Add the term and all its inputs to dependencies.

    Parameters
    ----------
    term : zipline.modelling.term.Term
        The term to add.  Must expose `inputs` and `extra_input_rows`.
    dependencies : networkx.DiGraph
        Graph being built.  Mutated in place: a node is added (or its
        `extra_rows` attribute raised) for `term`, and an edge
        ``input -> term`` is added for each of the term's inputs.
    parents : set
        Terms on the current traversal path, used for cycle detection.
        Left exactly as it was found when this function returns or raises.
    extra_rows : int
        Number of extra leading rows that the caller needs from `term`.

    Raises
    ------
    CyclicDependency
        If `term` is already on the current traversal path.
    """
    # If we've seen this node already as a parent of the current traversal,
    # it means we have an unsatisfiable dependency.  This should only be
    # possible if the term's inputs are mutated after construction.
    if term in parents:
        raise CyclicDependency(term)

    try:
        existing = dependencies.node[term]
    except KeyError:
        # We're not yet in the graph: add the term with the specified number
        # of extra rows.
        dependencies.add_node(term, extra_rows=extra_rows)
    else:
        # We're already in the graph because we've been traversed by another
        # parent.  If that traversal asked for at least as many rows as we
        # need now, every input below us has already been given enough rows
        # and every edge already exists, so there is nothing left to do.
        # Skipping here avoids re-walking shared sub-graphs once per path,
        # which is exponential for layered/diamond-shaped dependencies.
        if extra_rows <= existing['extra_rows']:
            return
        existing['extra_rows'] = extra_rows

    parents.add(term)
    try:
        for subterm in term.inputs:
            _add_to_graph(
                subterm,
                dependencies,
                parents,
                extra_rows=extra_rows + term.extra_input_rows,
            )
            dependencies.add_edge(subterm, term)
    finally:
        # Always unwind the traversal path, even if a cycle was reported
        # further down.
        parents.remove(term)
|
|
|
|
|
|
class FFCEngine(with_metaclass(ABCMeta)):
    """
    Abstract interface for engines that compute FFC factor matrices.
    """

    @abstractmethod
    def factor_matrix(self, terms, start_date, end_date):
        """
        Compute values for `terms` between `start_date` and `end_date`.

        The result is a DataFrame indexed by a MultiIndex of (date, asset)
        pairs.  For each date there is one row per asset that passed every
        `Filter` instance in `terms`; the columns are the keys of `terms`
        whose values are instances of `Factor`.

        Parameters
        ----------
        terms : dict
            Map from str -> zipline.modelling.term.Term.
        start_date : datetime
            The first date of the matrix.
        end_date : datetime
            The last date of the matrix.

        Returns
        -------
        matrix : pd.DataFrame
            A matrix of factors
        """
        raise NotImplementedError("factor_matrix")
|
|
|
|
|
|
class NoOpFFCEngine(FFCEngine):
    """
    FFCEngine that doesn't do anything.
    """

    def factor_matrix(self, terms, start, end):
        """
        Return a frame with no rows and one column per key of `terms`.

        The index is the product of the daily date range from `start` to
        `end` with an empty sequence of assets, so it contains no entries.
        Columns are the keys of `terms` in sorted order.
        """
        days = date_range(start=start, end=end, freq='D')
        no_rows = MultiIndex.from_product([days, ()])
        column_names = sorted(terms.keys())
        return DataFrame(index=no_rows, columns=column_names)
|
|
|
|
|
|
class SimpleFFCEngine(object):
    """
    FFC Engine class that computes each term independently.

    Parameters
    ----------
    loader : FFCLoader
        A loader to use to retrieve raw data for atomic terms.
    calendar : DatetimeIndex
        Array of dates to consider as trading days when computing a range
        between a fixed start and end.
    asset_finder : zipline.assets.AssetFinder
        An AssetFinder instance.  We depend on the AssetFinder to determine
        which assets are in the top-level universe at any point in time.
    """
    __slots__ = [
        '_loader',
        '_calendar',
        '_finder',
        '__weakref__',
    ]

    def __init__(self, loader, calendar, asset_finder):
        self._loader = loader
        self._calendar = calendar
        self._finder = asset_finder

    @staticmethod
    def _max_extra_rows(extra_row_counts):
        """
        Return the largest value in `extra_row_counts`, or 0 if it is empty.
        """
        # A bare max() raises ValueError on an empty sequence, which would
        # make an empty set of terms crash instead of producing an empty
        # result.
        if not extra_row_counts:
            return 0
        return max(extra_row_counts.values())

    def factor_matrix(self, terms, start_date, end_date):
        """
        Compute a factor matrix.

        Parameters
        ----------
        terms : dict[str -> zipline.modelling.term.Term]
            Dict mapping term names to instances.  The supplied names are
            used as column names in our output frame.
        start_date : pd.Timestamp
            Start date of the computed matrix.
        end_date : pd.Timestamp
            End date of the computed matrix.

        Returns
        -------
        matrix : pd.DataFrame
            Frame indexed by (date, asset) with one column per `Factor` in
            `terms`.  See `_format_factor_matrix`.

        Raises
        ------
        ValueError
            If `end_date` is not strictly after `start_date`.

        The algorithm implemented here can be broken down into the following
        stages:

        0. Build a dependency graph of all terms in `terms`.  Topologically
        sort the graph to determine an order in which we can compute the
        terms.

        1. Ask our AssetFinder for a "lifetimes matrix", which should
        contain, for each date between start_date and end_date, a boolean
        value for each known asset indicating whether the asset existed on
        that date.

        2. Compute each term in the dependency order determined in (0),
        caching the results in a dictionary so that they can be fed into
        future terms.

        3. For each date, determine the number of assets passing **all**
        filters.  The sum, N, of all these values is the total number of rows
        in our output frame.

        4. Gather, for each factor in `terms`, the N computed values that
        belong to those rows.

        5. Stick the values gathered in (4) into a DataFrame and return it.

        Step 0 is performed in `build_dependency_graph`.
        Step 1 is performed in `self.build_lifetimes_matrix`.
        Step 2 is performed in `self.compute_chunk`.
        Steps 3, 4, and 5 are performed in self._format_factor_matrix.

        See Also
        --------
        FFCEngine.factor_matrix
        """
        if end_date <= start_date:
            raise ValueError(
                "start_date must be before end_date \n"
                "start_date=%s, end_date=%s" % (start_date, end_date)
            )

        graph = build_dependency_graph(terms.values())
        # Materialize the ordering: depending on the networkx version this
        # may be a one-shot generator, and it is iterated more than once.
        ordered_terms = list(topological_sort(graph))
        extra_row_counts = get_node_attributes(graph, 'extra_rows')
        max_extra_rows = self._max_extra_rows(extra_row_counts)

        lifetimes = self.build_lifetimes_matrix(
            start_date,
            end_date,
            max_extra_rows,
        )
        # Drop the leading rows that exist only to feed windowed terms.
        lifetimes_between_dates = lifetimes.iloc[max_extra_rows:]

        dates = lifetimes_between_dates.index.values
        assets = lifetimes_between_dates.columns.values

        raw_outputs = self.compute_chunk(
            ordered_terms,
            extra_row_counts,
            lifetimes,
        )

        # We only need filters and factors to compute the final output
        # matrix.  The lifetimes mask acts as an implicit first filter.
        raw_filters = [lifetimes_between_dates.values]
        raw_factors = []
        factor_names = []
        for name, term in iteritems(terms):
            # Each stored output carries `extra` leading rows that were
            # computed for the benefit of downstream terms; strip them.
            extra = extra_row_counts[term]
            if isinstance(term, Factor):
                factor_names.append(name)
                raw_factors.append(raw_outputs[term][extra:])
            elif isinstance(term, Filter):
                raw_filters.append(raw_outputs[term][extra:])

        return self._format_factor_matrix(
            dates,
            assets,
            raw_filters,
            raw_factors,
            factor_names,
        )

    def build_lifetimes_matrix(self, start_date, end_date, extra_rows):
        """
        Compute a lifetimes matrix from our AssetFinder, then drop columns
        that didn't exist at all during the query dates.

        Parameters
        ----------
        start_date : pd.Timestamp
            Base start date for the matrix.
        end_date : pd.Timestamp
            End date for the matrix.
        extra_rows : int
            Number of rows prior to `start_date` to include.
            Extra rows are needed by terms like moving averages that require
            a trailing window of data to compute.

        Returns
        -------
        lifetimes : pd.DataFrame
            Frame of dtype `bool` containing dates from `extra_rows` days
            before `start_date`, continuing through to `end_date`.  The
            returned frame contains as columns all assets in our AssetFinder
            that existed for at least one day between `start_date` and
            `end_date`.

        Raises
        ------
        NoFurtherDataError
            If the calendar has fewer than `extra_rows` entries before
            `start_date`.
        """
        calendar = self._calendar
        finder = self._finder
        start_idx, end_idx = calendar.slice_locs(start_date, end_date)
        if start_idx < extra_rows:
            raise NoFurtherDataError(
                msg="Insufficient data to compute FFC Matrix: "
                    "start date was %s, "
                    "earliest known date was %s, "
                    "and %d extra rows were requested." % (
                        start_date, calendar[0], extra_rows,
                    ),
            )

        # Build a lifetimes matrix reaching back `extra_rows` calendar
        # entries before start_date.
        lifetimes = finder.lifetimes(
            calendar[start_idx - extra_rows:end_idx]
        )
        # Sanity checks: the requested endpoints must line up with rows of
        # the returned frame.
        assert lifetimes.index[extra_rows] == start_date
        assert lifetimes.index[-1] == end_date

        # Filter out columns that didn't exist between the requested start
        # and end dates.
        existed = lifetimes.iloc[extra_rows:].any()
        return lifetimes.loc[:, existed]

    def _inputs_for_term(self, term, workspace, extra_row_counts):
        """
        Compute inputs for the given term.

        This is mostly complicated by the fact that for each input we store
        as many rows as will be necessary to serve any term requiring that
        input.  Thus if Factor A needs 5 extra rows of price, and Factor B
        needs 3 extra rows of price, we need to remove 2 leading rows from
        our stored prices before passing them to Factor B.

        Parameters
        ----------
        term : zipline.modelling.term.Term
            The term whose inputs are being prepared.
        workspace : dict
            Map from term -> already-computed output for that term.
        extra_row_counts : dict
            Map from term -> number of extra leading rows stored for it.

        Returns
        -------
        inputs : list
            One entry per element of `term.inputs`, in the same order.
        """
        term_extra_rows = term.extra_input_rows
        if term.windowed:
            return [
                workspace[input_].traverse(
                    term.window_length,
                    offset=extra_row_counts[input_] - term_extra_rows
                )
                for input_ in term.inputs
            ]
        else:
            return [
                ensure_ndarray(
                    workspace[input_][
                        extra_row_counts[input_] - term_extra_rows:
                    ],
                )
                for input_ in term.inputs
            ]

    def compute_chunk(self, ordered_terms, extra_row_counts, base_mask):
        """
        Compute the FFC terms in the graph based on the assets and dates
        defined by base_mask.

        Parameters
        ----------
        ordered_terms : iterable
            Terms in an order such that every term appears after its inputs.
        extra_row_counts : dict
            Map from term -> number of extra leading rows to compute for it.
        base_mask : pd.DataFrame
            Mask whose leading rows cover the largest extra row count.

        Returns
        -------
        workspace : dict
            Dictionary mapping terms to computed arrays.
        """
        loader = self._loader
        # The terms are walked twice below (once to seed the workspace, once
        # to compute), so a one-shot iterable must be materialized first.
        ordered_terms = list(ordered_terms)
        max_extra_rows = self._max_extra_rows(extra_row_counts)
        workspace = {term: None for term in ordered_terms}

        for term in ordered_terms:
            # Trim the mask so that it has exactly as many leading rows as
            # this particular term requires.
            base_mask_for_term = base_mask.iloc[
                max_extra_rows - extra_row_counts[term]:
            ]
            if term.atomic:
                # FUTURE OPTIMIZATION: Scan the resolution order for terms in
                # the same dataset and load them here as well.
                to_load = [term]
                loaded = loader.load_adjusted_array(
                    to_load,
                    base_mask_for_term,
                )
                for loaded_term, adj_array in zip_longest(to_load, loaded):
                    workspace[loaded_term] = adj_array
            else:
                if term.windowed:
                    compute = term.compute_from_windows
                else:
                    compute = term.compute_from_arrays
                workspace[term] = compute(
                    self._inputs_for_term(term, workspace, extra_row_counts),
                    base_mask_for_term,
                )
        return workspace

    def _format_factor_matrix(self,
                              dates,
                              assets,
                              filter_data,
                              factor_data,
                              factor_names):
        """
        Convert raw computed filters/factors into a DataFrame for public
        APIs.

        Parameters
        ----------
        dates : np.array[datetime64]
            Index for raw data in filter_data/factor_data.
        assets : np.array[int64]
            Column labels for raw data in filter_data/factor_data.
        filter_data : list[ndarray[bool]]
            Raw filters data.
        factor_data : list[ndarray]
            Raw factor data.
        factor_names : list[str]
            Names of factors to use as keys.

        Returns
        -------
        factor_matrix : pd.DataFrame
            A DataFrame with the following indices:

            index : two-tiered MultiIndex of (date, asset).  For each date,
                we return a row for each asset that passed all filters on
                that date.  The date level is localized to UTC.
            columns : keys from `factor_names`

        Each date/asset/factor triple contains the computed value of the
        given factor on the given date for the given asset.
        """
        # Boolean mask of date/asset pairs that passed every filter.
        passed_all = reduce(and_, filter_data)

        # Parallel arrays of (row, column) coordinates for all date/asset
        # pairs that passed all filters.  nonzero() reports coordinates in
        # row-major order, so entries come out grouped by date and, within a
        # date, in asset-column order.  Each entry is one output row.
        row_idx, col_idx = passed_all.nonzero()

        # Fancy-index everything in one shot instead of filling the output
        # arrays date by date.
        raw_dates_index = dates[row_idx].astype('datetime64[ns]')
        raw_assets_index = assets[col_idx].astype(int)
        factor_outputs = [
            computed[row_idx, col_idx]
            for computed in factor_data
        ]

        return DataFrame(
            dict(zip(factor_names, factor_outputs)),
            index=MultiIndex.from_arrays(
                [raw_dates_index, raw_assets_index],
            )
        ).tz_localize('UTC', level=0)
|