""" Compute Engine for FFC API """ from abc import ( ABCMeta, abstractmethod, ) from operator import and_ from six import ( iteritems, itervalues, with_metaclass, ) from six.moves import ( reduce, zip_longest, ) from numpy import ( add, empty_like, ) from pandas import ( DataFrame, date_range, MultiIndex, ) from zipline.lib.adjusted_array import ensure_ndarray from zipline.errors import NoFurtherDataError from zipline.utils.pandas_utils import explode from .classifier import Classifier from .factor import Factor from .filter import Filter from .graph import TermGraph from .term import AssetExists class FFCEngine(with_metaclass(ABCMeta)): @abstractmethod def factor_matrix(self, terms, start_date, end_date): """ Compute values for `terms` between `start_date` and `end_date`. Returns a DataFrame with a MultiIndex of (date, asset) pairs on the index. On each date, we return a row for each asset that passed all instances of `Filter` in `terms, and the columns of the returned frame will be the keys in `terms` whose values are instances of `Factor`. Parameters ---------- terms : dict[str -> zipline.modelling.term.Term] Dict mapping term names to instances. The supplied names are used as column names in our output frame. start_date : pd.Timestamp Start date of the computed matrix. end_date : pd.Timestamp End date of the computed matrix. Returns ------- matrix : pd.DataFrame A matrix of computed results. """ raise NotImplementedError("factor_matrix") class NoOpFFCEngine(FFCEngine): """ FFCEngine that doesn't do anything. """ def factor_matrix(self, terms, start_date, end_date): return DataFrame( index=MultiIndex.from_product( [date_range(start=start_date, end=end_date, freq='D'), ()], ), columns=sorted(terms.keys()) ) class SimpleFFCEngine(object): """ FFC Engine class that computes each term independently. Parameters ---------- loader : FFCLoader A loader to use to retrieve raw data for atomic terms. calendar : DatetimeIndex Array of dates to consider as trading days when computing a range between a fixed start and end. asset_finder : zipline.assets.AssetFinder An AssetFinder instance. We depend on the AssetFinder to determine which assets are in the top-level universe at any point in time. """ __slots__ = [ '_loader', '_calendar', '_finder', '_root_mask_term', '__weakref__', ] def __init__(self, loader, calendar, asset_finder): self._loader = loader self._calendar = calendar self._finder = asset_finder self._root_mask_term = AssetExists() def factor_matrix(self, terms, start_date, end_date): """ Compute a factor matrix. Parameters ---------- terms : dict[str -> zipline.modelling.term.Term] Dict mapping term names to instances. The supplied names are used as column names in our output frame. start_date : pd.Timestamp Start date of the computed matrix. end_date : pd.Timestamp End date of the computed matrix. The algorithm implemented here can be broken down into the following stages: 0. Build a dependency graph of all terms in `terms`. Topologically sort the graph to determine an order in which we can compute the terms. 1. Ask our AssetFinder for a "lifetimes matrix", which should contain, for each date between start_date and end_date, a boolean value for each known asset indicating whether the asset existed on that date. 2. Compute each term in the dependency order determined in (0), caching the results in a a dictionary to that they can be fed into future terms. 3. For each date, determine the number of assets passing **all** filters. The sum, N, of all these values is the total number of rows in our output frame, so we pre-allocate an output array of length N for each factor in `terms`. 4. Fill in the arrays allocated in (3) by copying computed values from our output cache into the corresponding rows. 5. Stick the values computed in (4) into a DataFrame and return it. Step 0 is performed by `zipline.modelling.graph.TermGraph`. Step 1 is performed in `self._compute_root_mask`. Step 2 is performed in `self.compute_chunk`. Steps 3, 4, and 5 are performed in self._format_factor_matrix. See Also -------- FFCEngine.factor_matrix """ if end_date <= start_date: raise ValueError( "start_date must be before end_date \n" "start_date=%s, end_date=%s" % (start_date, end_date) ) graph = TermGraph(terms) extra_rows = graph.extra_rows[self._root_mask_term] root_mask = self._compute_root_mask(start_date, end_date, extra_rows) dates, assets, root_mask_values = explode(root_mask) raw_outputs = self.compute_chunk( graph, dates, assets, initial_workspace={self._root_mask_term: root_mask_values}, ) # Collect the results that we'll actually show to the user. filters, factors = {}, {} for name, term in iteritems(terms): if isinstance(term, Filter): filters[name] = raw_outputs[name] elif isinstance(term, Factor): factors[name] = raw_outputs[name] elif isinstance(term, Classifier): continue else: raise ValueError("Unknown term type: %s" % term) # Add the root mask as an implicit filter, truncating off the extra # rows that we only needed to compute other terms. filters['base'] = root_mask_values[extra_rows:] out_dates = dates[extra_rows:] return self._format_factor_matrix(out_dates, assets, filters, factors) def _compute_root_mask(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar finder = self._finder start_idx, end_idx = self._calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError( msg="Insufficient data to compute FFC Matrix: " "start date was %s, " "earliest known date was %s, " "and %d extra rows were requested." % ( start_date, calendar[0], extra_rows, ), ) # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` lifetimes = finder.lifetimes( calendar[start_idx - extra_rows:end_idx], include_start_date=False ) assert lifetimes.index[extra_rows] == start_date assert lifetimes.index[-1] == end_date if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist between the requested start and # end dates. existed = lifetimes.iloc[extra_rows:].any() return lifetimes.loc[:, existed] def _mask_and_dates_for_term(self, term, workspace, graph, dates): """ Load mask and mask row labels for term. """ mask = term.mask offset = graph.extra_rows[mask] - graph.extra_rows[term] return workspace[mask][offset:], dates[offset:] def _inputs_for_term(self, term, workspace, graph): """ Compute inputs for the given term. This is mostly complicated by the fact that for each input we store as many rows as will be necessary to serve **any** computation requiring that input. """ offsets = graph.offset if term.windowed: # If term is windowed, then all input data should be instances of # AdjustedArray. return [ workspace[input_].traverse( window_length=term.window_length, offset=offsets[term, input_] ) for input_ in term.inputs ] # If term is not windowed, input_data may be an AdjustedArray or # np.ndarray. Coerce the former to the latter. out = [] for input_ in term.inputs: input_data = ensure_ndarray(workspace[input_]) offset = offsets[term, input_] # OPTIMIZATION: Don't make a copy by doing input_data[0:] if # offset is zero. if offset: input_data = input_data[offset:] out.append(input_data) return out def compute_chunk(self, graph, dates, assets, initial_workspace): """ Compute the FFC terms in the graph for the requested start and end dates. Parameters ---------- graph : zipline.modelling.graph.TermGraph dates : pd.DatetimeIndex Row labels for our root mask. assets : pd.Int64Index Column labels for our root mask. initial_workspace : dict Map from term -> output. Must contain at least entry for `self._root_mask_term` whose shape is `(len(dates), len(assets))`, but may contain additional pre-computed terms for testing or optimization purposes. Returns ------- results : dict Dictionary mapping requested results to outputs. """ self._validate_compute_chunk_params(dates, assets, initial_workspace) loader = self._loader # Copy the supplied initial workspace so we don't mutate it in place. workspace = initial_workspace.copy() for term in graph.ordered(): # `term` may have been supplied in `initial_workspace`, and in the # future we may pre-compute atomic terms coming from the same # dataset. In either case, we will already have an entry for this # term, which we shouldn't re-compute. if term in workspace: continue # Asset labels are always the same, but date labels vary by how # many extra rows are needed. mask, mask_dates = self._mask_and_dates_for_term( term, workspace, graph, dates ) if term.atomic: # FUTURE OPTIMIZATION: Scan the resolution order for terms in # the same dataset and load them here as well. to_load = [term] loaded = loader.load_adjusted_array( to_load, mask_dates, assets, mask, ) assert len(to_load) == len(loaded) for loaded_term, adj_array in zip_longest(to_load, loaded): workspace[loaded_term] = adj_array else: workspace[term] = term._compute( self._inputs_for_term(term, workspace, graph), mask_dates, assets, mask, ) assert(workspace[term].shape == mask.shape) out = {} graph_extra_rows = graph.extra_rows for name, term in iteritems(graph.outputs): # Truncate off extra rows from outputs. out[name] = workspace[term][graph_extra_rows[term]:] return out def _format_factor_matrix(self, dates, assets, filters, factors): """ Convert raw computed filters/factors into a DataFrame for public APIs. Parameters ---------- dates : np.array[datetime64] Row index for arrays in `filters` and `factors.` assets : np.array[int64] Column index for arrays in `filters` and `factors.` filters : dict Dict mapping filter names -> computed filters. factors : dict Dict mapping factor names -> computed factors. Returns ------- factor_matrix : pd.DataFrame The indices of `factor_matrix` are as follows: index : two-tiered MultiIndex of (date, asset). For each date, we return a row for each asset that passed all filters on that date. columns : keys from `factor_data` Each date/asset/factor triple contains the computed value of the given factor on the given date for the given asset. """ # FUTURE OPTIMIZATION: Cythonize all of this. # Boolean mask of values that passed all filters. unioned = reduce(and_, itervalues(filters)) # Parallel arrays of (x,y) coords for (date, asset) pairs that passed # all filters. Each entry here will correspond to a row in our output # frame. nonzero_xs, nonzero_ys = unioned.nonzero() # Raw arrays storing (date, asset) pairs. # These will form the index of our output frame. raw_dates_index = empty_like(nonzero_xs, dtype='datetime64[ns]') raw_assets_index = empty_like(nonzero_xs, dtype=int) # Mapping from column_name -> array. # This will be the `data` arg to our output frame. columns = { name: empty_like(nonzero_xs, dtype=factor.dtype) for name, factor in iteritems(factors) } # We're going to iterate over `iteritems(columns)` a whole bunch of # times down below. It's faster to construct iterate over a tuple of # pairs. columns_iter = tuple(iteritems(columns)) # This is tricky. # unioned.sum(axis=1) gives us an array of the same size as `dates` # containing, for each date, the number of assets that passed our # filters on that date. # Running this through add.accumulate gives us an array containing, for # each date, the running total of the number of assets that passed our # filters on or before that date. # This means that (bounds[i - 1], bounds[i]) gives us the indices of # the first and last rows in our output frame for each date in `dates`. bounds = add.accumulate(unioned.sum(axis=1)) day_start = 0 for day_idx, day_end in enumerate(bounds): day_bounds = slice(day_start, day_end) column_indices = nonzero_ys[day_bounds] raw_dates_index[day_bounds] = dates[day_idx] raw_assets_index[day_bounds] = assets[column_indices] for name, colarray in columns_iter: colarray[day_bounds] = factors[name][day_idx, column_indices] # Upper bound of current row becomes lower bound for next row. day_start = day_end return DataFrame( data=columns, index=MultiIndex.from_arrays( [ raw_dates_index, # FUTURE OPTIMIZATION: # Avoid duplicate lookups by grouping and only looking up # each unique sid once. self._finder.retrieve_all(raw_assets_index), ], ) ).tz_localize('UTC', level=0) def _validate_compute_chunk_params(self, dates, assets, initial_workspace): """ Verify that the values passed to compute_chunk are well-formed. """ root = self._root_mask_term clsname = type(self).__name__ # Writing this out explicitly so this errors in testing if we change # the name without updating this line. compute_chunk_name = self.compute_chunk.__name__ if root not in initial_workspace: raise AssertionError( "root_mask values not supplied to {cls}.{method}".format( cls=clsname, method=compute_chunk_name, ) ) shape = initial_workspace[root].shape implied_shape = len(dates), len(assets) if shape != implied_shape: raise AssertionError( "root_mask shape is {shape}, but received dates/assets " "imply that shape should be {implied}".format( shape=shape, implied=implied_shape, ) )