BUG: Clear asset caches when mapping DataFrame.

Our DataFrame index resolution logic relies on failed lookups **not**
being cached, but not caching failed lookups is a nontrivial performance
hit when repeatedly looking up sids.  The "solution" here is to clear
the caches after writing in new assets.

The real fix for this is either:

1. Don't construct an AssetFinder until we have the datasource in hand
   in run(), or
2. Don't symbol-map the user's input source if it's a DataFrame.
   Instead we should make our data loaders pre-map the data.
This commit is contained in:
Scott Sanderson
2015-11-03 16:57:17 -05:00
parent 5bddf32062
commit 654edaa851
2 changed files with 24 additions and 6 deletions
+11 -4
View File
@@ -21,8 +21,10 @@ from pandas.tseries.tools import normalize_date
import numpy as np
from datetime import datetime
from itertools import groupby, chain, repeat
from numbers import Integral
from operator import attrgetter
from six.moves import filter
from six import (
exec_,
@@ -30,7 +32,6 @@ from six import (
itervalues,
string_types,
)
from operator import attrgetter
from zipline.errors import (
@@ -96,6 +97,7 @@ from zipline.utils.events import (
TimeRuleFactory,
)
from zipline.utils.factory import create_simulation_parameters
from zipline.utils.functional import is_a
from zipline.utils.math_utils import tolerant_equals
from zipline.utils.preprocess import preprocess
@@ -608,8 +610,7 @@ class TradingAlgorithm(object):
if isinstance(identifier, Asset):
asset = self.asset_finder.retrieve_asset(sid=identifier.sid,
default_none=True)
elif hasattr(identifier, '__int__'):
elif isinstance(identifier, Integral):
asset = self.asset_finder.retrieve_asset(sid=identifier,
default_none=True)
if asset is None:
@@ -618,6 +619,12 @@ class TradingAlgorithm(object):
self.trading_environment.write_data(
equities_identifiers=identifiers_to_build)
# We need to clear out any cache misses that were stored while trying
# to do lookups. The real fix for this problem is to not construct an
# AssetFinder until we `run()` when we actually have all the data we
# need to do so.
self.asset_finder._reset_caches()
return self.asset_finder.map_identifier_index_to_sids(
identifiers, as_of_date,
)
+13 -2
View File
@@ -99,12 +99,23 @@ class AssetFinder(object):
#
# The caches are read through, i.e. accessing an asset through
# retrieve_asset will populate the cache on first retrieval.
self._asset_cache = {}
self._asset_type_cache = {}
self._caches = (self._asset_cache, self._asset_type_cache) = {}, {}
# Populated on first call to `lifetimes`.
self._asset_lifetimes = None
def _reset_caches(self):
"""
Reset our asset caches.

Empties, in place, every cache object held in ``self._caches``.

You probably shouldn't call this method.
"""
# This method exists as a workaround for the in-place mutating behavior
# of `TradingAlgorithm._write_and_map_id_index_to_sids`. No one else
# should be calling this.
#
# The caches are cleared rather than replaced, so any other attribute
# bound to the same cache objects sees the emptied containers too.
for cache in self._caches:
cache.clear()
def lookup_asset_types(self, sids):
"""
Retrieve asset types for a list of sids.