ENH: Adds AssetFinder and asset metadata management

This commit is contained in:
jfkirk
2015-06-08 15:53:48 -04:00
parent 9da644f767
commit 035bfbd514
4 changed files with 1115 additions and 34 deletions
+489 -11
View File
@@ -20,7 +20,177 @@ Tests for the zipline.assets package
import sys
from unittest import TestCase
from zipline.assets._assets import Asset, Future
from datetime import (
datetime,
timedelta,
)
import pickle
import pprint
import pytz
import uuid
import pandas as pd
from nose_parameterized import parameterized
from zipline.finance.trading import with_environment
from zipline.assets import Asset, Future, AssetFinder
from zipline.errors import (
SymbolNotFound,
MultipleSymbolsFound,
)
class FakeTable(object):
def __init__(self, name, count, dt, fuzzy_str):
self.name = name
self.count = count
self.dt = dt
self.fuzzy_str = fuzzy_str
self.df = pd.DataFrame.from_records(
[
{
'sid': i,
'file_name': 'TEST%s%s' % (self.fuzzy_str, i),
'company_name': self.name + str(i),
'start_date_nano': pd.Timestamp(dt, tz='UTC').value,
'end_date_nano': pd.Timestamp(dt, tz='UTC').value,
'exchange': self.name,
}
for i in range(1, self.count + 1)
]
)
def read(self, *args, **kwargs):
return self.df.to_records()
class FakeTableIdenticalSymbols(object):
def __init__(self, name, as_of_dates):
self.name = name
self.as_of_dates = as_of_dates
self.df = pd.DataFrame.from_records(
[
{
'sid': i,
'file_name': self.name,
'company_name': self.name,
'start_date_nano': date.value,
'end_date_nano': (date + timedelta(days=1)).value,
'exchange': self.name,
}
for i, date in enumerate(self.as_of_dates)
]
)
def read(self, *args, **kwargs):
return self.df.to_records()
class FakeTableFromRecords(object):
def __init__(self, records):
self.records = records
self.df = pd.DataFrame.from_records(self.records)
def read(self, *args, **kwargs):
return self.df.to_records()
@with_environment()
def build_lookup_generic_cases(env=None):
"""
Generate test cases for AssetFinder test_lookup_generic.
"""
unique_start = pd.Timestamp('2013-01-01', tz='UTC')
unique_end = pd.Timestamp('2014-01-01', tz='UTC')
dupe_0_start = pd.Timestamp('2013-01-01', tz='UTC')
dupe_0_end = dupe_0_start + timedelta(days=1)
dupe_1_start = pd.Timestamp('2013-01-03', tz='UTC')
dupe_1_end = dupe_1_start + timedelta(days=1)
table = FakeTableFromRecords(
[
{
'sid': 0,
'file_name': 'duplicated',
'company_name': 'duplicated_0',
'start_date_nano': dupe_0_start.value,
'end_date_nano': dupe_0_end.value,
'exchange': '',
},
{
'sid': 1,
'file_name': 'duplicated',
'company_name': 'duplicated_1',
'start_date_nano': dupe_1_start.value,
'end_date_nano': dupe_1_end.value,
'exchange': '',
},
{
'sid': 2,
'file_name': 'unique',
'company_name': 'unique',
'start_date_nano': unique_start.value,
'end_date_nano': unique_end.value,
'exchange': '',
},
],
)
env.update_asset_finder(asset_metadata=table.df)
dupe_0, dupe_1, unique = assets = [
env.asset_finder.retrieve_asset(i)
for i in range(3)
]
# This expansion code is run at module import time, which means we have to
# clear the AssetFinder here or else it will interfere with the cache
# for other tests.
env.update_asset_finder(clear_metadata=True)
dupe_0_start = dupe_0.start_date
dupe_1_start = dupe_1.start_date
cases = [
##
# Scalars
# Asset object
(table, assets[0], None, assets[0]),
(table, assets[1], None, assets[1]),
(table, assets[2], None, assets[2]),
# int
(table, 0, None, assets[0]),
(table, 1, None, assets[1]),
(table, 2, None, assets[2]),
# Duplicated symbol with resolution date
(table, 'duplicated', dupe_0_start, dupe_0),
(table, 'duplicated', dupe_1_start, dupe_1),
# Unique symbol, with or without resolution date.
(table, 'unique', unique_start, unique),
(table, 'unique', None, unique),
##
# Iterables
# Iterables of Asset objects.
(table, assets, None, assets),
(table, iter(assets), None, assets),
# Iterables of ints
(table, (0, 1), None, assets[:-1]),
(table, iter((0, 1)), None, assets[:-1]),
# Iterables of symbols.
(table, ('duplicated', 'unique'), dupe_0_start, [dupe_0, unique]),
(table, ('duplicated', 'unique'), dupe_1_start, [dupe_1, unique]),
# Mixed types
(table,
('duplicated', 2, 'unique', 1, dupe_1),
dupe_0_start,
[dupe_0, assets[2], unique, assets[1], dupe_1]),
]
return cases
class AssetTestCase(TestCase):
@@ -35,8 +205,53 @@ class AssetTestCase(TestCase):
self.assertEquals(str(Asset(5061)), 'Asset(5061)')
def test_asset_is_pickleable(self):
class TestAssetRichCmp(TestCase):
# Very wow
s = Asset(
1337,
symbol="DOGE",
asset_name="DOGECOIN",
start_date=pd.Timestamp('2013-12-08 9:31AM', tz='UTC'),
end_date=pd.Timestamp('2014-06-25 11:21AM', tz='UTC'),
first_traded=pd.Timestamp('2013-12-08 9:31AM', tz='UTC'),
exchange='THE MOON',
)
s_unpickled = pickle.loads(pickle.dumps(s))
attrs_to_check = ['end_date',
'exchange',
'first_traded',
'asset_end_date',
'asset_name',
'asset_start_date',
'sid',
'start_date',
'symbol']
for attr in attrs_to_check:
self.assertEqual(getattr(s, attr), getattr(s_unpickled, attr))
def test_asset_comparisons(self):
s_23 = Asset(23)
s_24 = Asset(24)
self.assertEqual(s_23, s_23)
self.assertEqual(s_23, 23)
self.assertEqual(23, s_23)
self.assertNotEqual(s_23, s_24)
self.assertNotEqual(s_23, 24)
self.assertNotEqual(s_23, "23")
self.assertNotEqual(s_23, 23.5)
self.assertNotEqual(s_23, [])
self.assertNotEqual(s_23, None)
self.assertLess(s_23, s_24)
self.assertLess(s_23, 24)
self.assertGreater(24, s_23)
self.assertGreater(s_24, s_23)
def test_lt(self):
self.assertTrue(Asset(3) < Asset(4))
@@ -74,16 +289,279 @@ class TestAssetRichCmp(TestCase):
'a' < Asset(3)
class testFuture(TestCase):
class TestFuture(TestCase):
future = Future(2468,
symbol='OMK15',
notice_date='2014-01-20',
expiration_date='2014-02-20',
contract_multiplier=500)
def test_str(self):
strd = self.future.__str__()
self.assertEqual("Future(2468 [OMK15])", strd)
def test_repr(self):
reprd = self.future.__repr__()
self.assertTrue("Future" in reprd)
self.assertTrue("2468" in reprd)
self.assertTrue("OMK15" in reprd)
self.assertTrue("notice_date='2014-01-20'" in reprd)
self.assertTrue("expiration_date='2014-02-20'" in reprd)
self.assertTrue("contract_multiplier=500" in reprd)
future = Future(2468,
notice_date='2014-01-20',
expiration_date='2014-02-20')
rep = future.__repr__()
def test_reduce(self):
reduced = self.future.__reduce__()
self.assertEqual(Future, reduced[0])
self.assertTrue("Future" in rep)
self.assertTrue("2468" in rep)
self.assertTrue("notice_date='2014-01-20'" in rep)
self.assertTrue("expiration_date='2014-02-20'" in rep)
def test_to_and_from_dict(self):
dictd = self.future.to_dict()
self.assertTrue('notice_date' in dictd)
self.assertTrue('expiration_date' in dictd)
self.assertTrue('contract_multiplier' in dictd)
from_dict = Future.from_dict(dictd)
self.assertTrue(isinstance(from_dict, Future))
self.assertEqual(self.future, from_dict)
class AssetFinderTestCase(TestCase):
@with_environment()
def test_lookup_symbol_fuzzy(self, env=None):
fuzzy_str = '@'
as_of_date = datetime(2013, 1, 1, tzinfo=pytz.utc)
table = FakeTable(uuid.uuid4().hex, 2, as_of_date,
fuzzy_str)
env.update_asset_finder(asset_metadata=table.df)
sf = env.asset_finder
try:
for i in range(2): # we do it twice to test for caching bugs
self.assertIsNone(sf.lookup_symbol('test', as_of_date))
self.assertIsNotNone(sf.lookup_symbol(
'test%s%s' % (fuzzy_str, 1), as_of_date))
self.assertIsNone(sf.lookup_symbol('test%s' % 1, as_of_date))
self.assertIsNone(sf.lookup_symbol(table.name, as_of_date,
fuzzy=fuzzy_str))
self.assertIsNotNone(sf.lookup_symbol(
'test%s%s' % (fuzzy_str, 1), as_of_date, fuzzy=fuzzy_str))
self.assertIsNotNone(sf.lookup_symbol(
'test%s' % 1, as_of_date, fuzzy=fuzzy_str))
finally:
env.update_asset_finder(clear_metadata=True)
@with_environment()
def test_lookup_symbol_resolve_multiple(self, env=None):
as_of_dates = [
pd.Timestamp('2013-01-01', tz='UTC') + timedelta(days=i)
# Incrementing by two so that start and end dates for each
# generated Asset don't overlap (each Asset's end_date is the
# day after its start date.)
for i in range(0, 10, 2)
]
table = FakeTableIdenticalSymbols(
name='existing',
as_of_dates=as_of_dates,
)
env.update_asset_finder(asset_metadata=table.df)
sf = env.asset_finder
try:
for _ in range(2): # we do it twice to test for caching bugs
with self.assertRaises(SymbolNotFound):
sf.lookup_symbol_resolve_multiple('non_existing',
as_of_dates[0])
with self.assertRaises(MultipleSymbolsFound):
sf.lookup_symbol_resolve_multiple('existing',
None)
for i, date in enumerate(as_of_dates):
# Verify that we correctly resolve multiple symbols using
# the supplied date
result = sf.lookup_symbol_resolve_multiple(
'existing',
date,
)
self.assertEqual(result.symbol, 'existing')
self.assertEqual(result.sid, i)
finally:
env.update_asset_finder(clear_metadata=True)
@with_environment()
def test_lookup_symbol_nasdaq_underscore_collisions(self, env=None):
"""
Ensure that each NASDAQ symbol without underscores maps back to the
original symbol when using fuzzy matching.
"""
sf = env.asset_finder
fuzzy_str = '_'
collisions = []
try:
for sid in sf.sids:
sec = sf.retrieve_asset(sid)
if sec.exchange.startswith('NASDAQ'):
found = sf.lookup_symbol(sec.symbol.replace(fuzzy_str, ''),
sec.end_date, fuzzy=fuzzy_str)
if found != sec:
collisions.append((found, sec))
# KNOWN BUG: Filter out assets that have intersections in their
# start and end dates. We can't correctly resolve these.
unexpected_errors = []
for first, second in collisions:
overlapping_dates = (
first.end_date >= second.start_date or
second.end_date >= first.end_date
)
if not overlapping_dates:
unexpected_errors.append((first, second))
self.assertFalse(
unexpected_errors,
pprint.pformat(unexpected_errors),
)
finally:
env.update_asset_finder(clear_metadata=True)
@parameterized.expand(
build_lookup_generic_cases()
)
@with_environment()
def test_lookup_generic(self, table, symbols, reference_date, expected,
env=None):
"""
Ensure that lookup_generic works with various permutations of inputs.
"""
try:
env.update_asset_finder(asset_metadata=table.df)
finder = env.asset_finder
results, missing = finder.lookup_generic(symbols, reference_date)
self.assertEqual(results, expected)
self.assertEqual(missing, [])
finally:
env.update_asset_finder(clear_metadata=True)
@with_environment()
def test_lookup_generic_handle_missing(self, env=None):
try:
table = FakeTableFromRecords(
[
# Sids that will be found when we do lookups.
{
'sid': 0,
'file_name': 'real',
'company_name': 'real',
'start_date_nano': pd.Timestamp('2013-1-1', tz='UTC'),
'end_date_nano': pd.Timestamp('2014-1-1', tz='UTC'),
'exchange': '',
},
{
'sid': 1,
'file_name': 'also_real',
'company_name': 'also_real',
'start_date_nano': pd.Timestamp('2013-1-1', tz='UTC'),
'end_date_nano': pd.Timestamp('2014-1-1', tz='UTC'),
'exchange': '',
},
# Sid whose end date is before our query date. We should
# still correctly find it.
{
'sid': 2,
'file_name': 'real_but_old',
'company_name': 'real_but_old',
'start_date_nano': pd.Timestamp('2002-1-1', tz='UTC'),
'end_date_nano': pd.Timestamp('2003-1-1', tz='UTC'),
'exchange': '',
},
# Sid whose end date is before our query date. We should
# still correctly find it.
{
'sid': 3,
'file_name': 'real_but_in_the_future',
'company_name': 'real_but_in_the_future',
'start_date_nano': pd.Timestamp('2014-1-1', tz='UTC'),
'end_date_nano': pd.Timestamp('2020-1-1', tz='UTC'),
'exchange': 'THE FUTURE',
},
]
)
env.update_asset_finder(asset_metadata=table.df)
symbols = [
'real', 1, 'fake', 'real_but_old', 'real_but_in_the_future',
]
results, missing = env.asset_finder.lookup_generic(
symbols,
pd.Timestamp('2013-02-01', tz='UTC'),
)
self.assertEqual(len(results), 3)
self.assertEqual(results[0].symbol, 'real')
self.assertEqual(results[0].sid, 0)
self.assertEqual(results[1].symbol, 'also_real')
self.assertEqual(results[1].sid, 1)
self.assertEqual(len(missing), 2)
self.assertEqual(missing[0], 'fake')
self.assertEqual(missing[1], 'real_but_in_the_future')
finally:
env.update_asset_finder(clear_metadata=True)
def test_insert_metadata(self):
finder = AssetFinder()
finder.insert_metadata(0,
asset_type='equity',
start_date='2014-01-01',
end_date='2015-01-01',
symbol="PLAY",
foo_data="FOO",)
# Test proper insertion
self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
self.assertEqual('2015-01-01', finder.metadata_cache[0]['end_date'])
# Test invalid field
self.assertFalse('foo_data' in finder.metadata_cache[0])
# Test updating fields
finder.insert_metadata(0,
asset_type='equity',
start_date='2014-01-01',
end_date='2015-02-01',
symbol="PLAY",
exchange="NYSE",)
self.assertEqual('2015-02-01', finder.metadata_cache[0]['end_date'])
self.assertEqual('NYSE', finder.metadata_cache[0]['exchange'])
# Check that old data survived
self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
def test_consume_metadata(self):
# Test dict consumption
finder = AssetFinder({0: {'asset_type': 'equity'}})
dict_to_consume = {0: {'symbol': 'PLAY'},
1: {'symbol': 'MSFT'}}
finder.consume_metadata(dict_to_consume)
self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
# Test dataframe consumption
df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
df['asset_name'][0] = "Dave'N'Busters"
df['exchange'][0] = "NASDAQ"
df['asset_name'][1] = "Microsoft"
df['exchange'][1] = "NYSE"
finder.consume_metadata(df)
self.assertEqual('NASDAQ', finder.metadata_cache[0]['exchange'])
self.assertEqual('Microsoft', finder.metadata_cache[1]['asset_name'])
# Check that old data survived
self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
+22
View File
@@ -12,3 +12,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ._assets import (
Asset,
Equity,
Future,
make_asset_array,
CACHE_FILE_TEMPLATE
)
from .assets import (
AssetFinder,
AssetConvertible
)
__all__ = [
'Asset',
'Equity',
'Future',
'AssetFinder',
'AssetConvertible',
'make_asset_array',
'CACHE_FILE_TEMPLATE'
]
+63 -23
View File
@@ -21,9 +21,10 @@ cimport cython
import numpy as np
cimport numpy as np
cdef enum AssetType:
EQUITY = 1
FUTURE = 2
# IMPORTANT NOTE: You must change this template if you change
# Asset.__reduce__, or else we'll attempt to unpickle an old version of this
# class
CACHE_FILE_TEMPLATE = '/tmp/.%s-%s.v4.cache'
cdef class Asset:
@@ -33,9 +34,7 @@ cdef class Asset:
cdef readonly object symbol
cdef readonly object asset_name
cdef readonly AssetType asset_type
# TODO: Maybe declare as pandas Timestamp?
cdef readonly object start_date
cdef readonly object end_date
cdef public object first_traded
@@ -179,26 +178,21 @@ cdef class Asset:
'exchange': self.exchange,
}
@staticmethod
def from_dict(dict_):
@classmethod
def from_dict(cls, dict_):
"""
Build an Asset instance from a dict.
"""
return Asset(**dict_)
return cls(**dict_)
cdef class Equity(Asset):
def __cinit__(self,
int sid, # sid is required
object symbol="",
object asset_name="",
object start_date=None,
object end_date=None,
object first_traded=None,
object exchange=""):
self.asset_type = EQUITY
def __str__(self):
if self.symbol:
return 'Equity(%d [%s])' % (self.sid, self.symbol)
else:
return 'Equity(%d)' % self.sid
def __repr__(self):
attrs = ('symbol', 'asset_name', 'exchange',
@@ -214,6 +208,7 @@ cdef class Future(Asset):
cdef readonly object notice_date
cdef readonly object expiration_date
cdef readonly int contract_multiplier
def __cinit__(self,
int sid, # sid is required
@@ -224,18 +219,63 @@ cdef class Future(Asset):
object notice_date=None,
object expiration_date=None,
object first_traded=None,
object exchange=""):
object exchange="",
int contract_multiplier=1):
self.asset_type = FUTURE
self.notice_date = notice_date
self.expiration_date = expiration_date
self.notice_date = notice_date
self.expiration_date = expiration_date
self.contract_multiplier = contract_multiplier
# Assign the expiration as the end_date if end_date is not explicit
if self.end_date is None:
self.end_date = expiration_date
def __str__(self):
if self.symbol:
return 'Future(%d [%s])' % (self.sid, self.symbol)
else:
return 'Future(%d)' % self.sid
def __repr__(self):
attrs = ('symbol', 'asset_name', 'exchange',
'start_date', 'end_date', 'first_traded', 'notice_date',
'expiration_date')
'expiration_date', 'contract_multiplier')
tuples = ((attr, repr(getattr(self, attr, None)))
for attr in attrs)
strings = ('%s=%s' % (t[0], t[1]) for t in tuples)
params = ', '.join(strings)
return 'Future(%d, %s)' % (self.sid, params)
cpdef __reduce__(self):
"""
Function used by pickle to determine how to serialize/deserialize this
class. Should return a tuple whose first element is self.__class__,
and whose second element is a tuple of all the attributes that should
be serialized/deserialized during pickling.
"""
return (self.__class__, (self.sid,
self.symbol,
self.asset_name,
self.start_date,
self.end_date,
self.notice_date,
self.expiration_date,
self.first_traded,
self.exchange,
self.contract_multiplier,))
cpdef to_dict(self):
"""
Convert to a python dict.
"""
super_dict = super(Future, self).to_dict()
super_dict['notice_date'] = self.notice_date
super_dict['expiration_date'] = self.expiration_date
super_dict['contract_multiplier'] = self.contract_multiplier
return super_dict
def make_asset_array(int size, Asset asset):
cdef np.ndarray out = np.empty([size], dtype=object)
out.fill(asset)
return out
+541
View File
@@ -0,0 +1,541 @@
#
# Copyright 2015 Quantopian, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABCMeta
from itertools import chain
from numbers import Integral
import numpy as np
import operator
from logbook import Logger
import pandas as pd
from pandas.tseries.tools import normalize_date
from six import with_metaclass, string_types
from zipline.errors import (
SymbolNotFound,
MultipleSymbolsFound,
SidNotFound,
IdentifierNotFound,
ConsumeAssetMetaDataError,
InvalidAssetType,
)
from zipline.utils import tradingcalendar
from zipline.assets._assets import (
Asset, Equity, Future
)
log = Logger('assets.py')
# Expected fields for an Asset's metadata
ASSET_FIELDS = [
'sid',
'asset_type',
'symbol',
'asset_name',
'start_date',
'end_date',
'first_traded',
'exchange',
'notice_date',
'expiration_date',
'contract_multiplier',
# The following fields are for compatibility with other systems
'file_name', # Used as symbol
'company_name', # Used as asset_name
'start_date_nano', # Used as start_date
'end_date_nano', # Used as end_date
]
class AssetFinder(object):
def __init__(self,
metadata=None,
trading_calendar=tradingcalendar):
# Any particular instance of AssetFinder should be
# consistent throughout its lifetime, so we grab a reference
# to our cache now. That way, if the cache is refreshed later,
# our instance will continue to use the old one.
self.cache = {}
self.sym_cache = {}
self.identifier_cache = {}
self.fuzzy_match = {}
# The AssetFinder also holds a nested-dict of all metadata for
# reference when building Assets
self.metadata_cache = {}
if metadata:
self.consume_metadata(metadata)
self.trading_calendar = trading_calendar
self.populate_cache()
def _next_free_sid(self):
if len(self.cache) > 0:
return max(self.cache.keys()) + 1
return 0
def _assign_sid(self, identifier):
if hasattr(identifier, '__int__'):
return identifier.__int__()
if isinstance(identifier, string_types):
return self._next_free_sid()
def retrieve_asset(self, sid, default_none=False):
if isinstance(sid, Asset):
return sid
asset = self.cache.get(sid)
if asset is not None:
return asset
elif default_none:
return None
else:
raise SidNotFound(sid=sid)
def retrieve_asset_by_identifier(self, identifier):
if isinstance(identifier, Asset):
return identifier
asset = self.identifier_cache.get(identifier)
if asset is not None:
return asset
else:
raise IdentifierNotFound(identifier=identifier)
@staticmethod
def _lookup_symbol_in_infos(infos, as_of_date):
"""
Search a list of symbols matching a given asset for the most recent
known symbol as of as_of_date.
Returns a pair of (Asset, bool), representing the best match we
found for as_of_date, and whether or not that match was actually
trading at as_of_date.
If no entry in infos started before as_of_date, return (None, False).
"""
# Sort entries by end_date before iterating. If asset start and end
# dates were always disjoint, then we could sort by either start or
# end_date and get the same sorting.
infos = sorted(infos, key=operator.attrgetter('end_date'))
# Find the newest asset that started before as_of_date.
candidates = [i for i in infos
if (i.start_date is None or i.start_date <= as_of_date)
and (i.end_date is None or as_of_date <= i.end_date)]
# If one SID exists for symbol, return that symbol
if len(candidates) == 1:
return candidates[0], True
# If no SID exists for symbol, return SID with the
# highest-but-not-over end_date
if len(candidates) == 0:
candidates = [i for i in infos
if i.end_date < as_of_date]
return (candidates[-1], False) if candidates else (None, False)
# If multiple SIDs exist for symbol, return latest start_date with
# end_date as a tie-breaker
if len(candidates) > 1:
best_candidate = sorted(
candidates,
key=lambda x: (x.start_date, x.end_date)
)[-1]
return best_candidate, True
def lookup_symbol_resolve_multiple(self, symbol, as_of_date=None):
"""
Return matching Asset of name symbol in database.
If multiple Assets are found and as_of_date is not set,
raises MultipleSymbolsFound.
If no Asset was active at as_of_date, and allow_expired is False
raises SymbolNotFound.
"""
if as_of_date is not None:
as_of_date = normalize_date(as_of_date)
if symbol not in self.sym_cache:
raise SymbolNotFound(symbol=symbol)
infos = self.sym_cache[symbol]
if as_of_date is None:
if len(infos) == 1:
return infos[0]
else:
raise MultipleSymbolsFound(symbol=symbol,
options=str(infos))
# Try to find symbol matching as_of_date
asset, _ = self._lookup_symbol_in_infos(infos, as_of_date)
if asset is None:
raise SymbolNotFound(symbol=symbol)
return asset
def lookup_symbol(self, symbol, as_of_date, fuzzy=None):
"""
If a fuzzy string is provided, then we try various symbols based on
the provided symbol. This is to facilitate mapping from a broker's
symbol to ours in cases where mapping to the broker's symbol loses
information. For example, if we have CMCS_A, but a broker has CMCSA,
when the broker provides CMCSA, it can also provide fuzzy='_',
so we can find a match by inserting an underscore.
"""
symbol = symbol.upper()
as_of_date = normalize_date(as_of_date)
if not fuzzy:
try:
return self.lookup_symbol_resolve_multiple(symbol, as_of_date)
except SymbolNotFound:
return None
else:
try:
return self.fuzzy_match[(symbol, fuzzy, as_of_date)]
except KeyError:
# if symbol is CMCSA and fuzzy is '_', then
# try CMCSA, then CMCS_A, then CMC_SA, etc.
for fuzzy_symbol in chain(
(symbol,),
(symbol[:i] + fuzzy + symbol[i:]
for i in range(len(symbol) - 1, 0, -1))):
infos = self.sym_cache.get(fuzzy_symbol)
if infos:
info, date_match = self._lookup_symbol_in_infos(
infos,
as_of_date,
)
if info is not None and date_match:
self.fuzzy_match[(symbol, fuzzy, as_of_date)] = \
info
return info
else:
self.fuzzy_match[(symbol, fuzzy, as_of_date)] = None
def populate_cache(self):
"""
Populates the asset cache with all values in the assets
collection.
"""
# Wipe caches before repopulating
self.cache = {}
self.sym_cache = {}
self.identifier_cache = {}
self.fuzzy_match = {}
counter = 0
for identifier, row in self.metadata_cache.items():
self.spawn_asset(identifier=identifier, **row)
counter += 1
def spawn_asset(self, identifier, **kwargs):
# Check if the sid is declared
try:
kwargs['sid']
pass
except KeyError:
# Assign the identifier as the sid, if applicable
if isinstance(identifier, int):
kwargs['sid'] = identifier
# If the identifier is not a sid, assign one
else:
kwargs['sid'] = self._assign_sid(identifier)
# Update the metadata object with the new sid
self.insert_metadata(identifier=identifier, sid=kwargs['sid'])
# If the file_name is in the kwargs, it may be the symbol
try:
file_name = kwargs.pop('file_name')
try:
kwargs['symbol']
except KeyError:
kwargs['symbol'] = file_name
except KeyError:
pass
# If the identifier coming in was a string and there is no defined
# symbol yet, set the symbol to the incoming identifier
try:
kwargs['symbol']
pass
except KeyError:
if isinstance(identifier, string_types):
kwargs['symbol'] = identifier
# If the company_name is in the kwargs, it may be the asset_name
try:
company_name = kwargs.pop('company_name')
try:
kwargs['asset_name']
except KeyError:
kwargs['asset_name'] = company_name
except KeyError:
pass
# If dates are given as nanos, pop them
try:
kwargs['start_date'] = kwargs.pop('start_date_nano')
except KeyError:
pass
try:
kwargs['end_date'] = kwargs.pop('end_date_nano')
except KeyError:
pass
try:
kwargs['notice_date'] = kwargs.pop('notice_date_nano')
except KeyError:
pass
try:
kwargs['expiration_date'] = kwargs.pop('expiration_date_nano')
except KeyError:
pass
# Process dates to Timestamps
try:
kwargs['start_date'] = pd.Timestamp(kwargs['start_date'], tz='UTC')
except KeyError:
pass
try:
kwargs['end_date'] = pd.Timestamp(kwargs['end_date'], tz='UTC')
except KeyError:
pass
try:
kwargs['notice_date'] = pd.Timestamp(kwargs['notice_date'],
tz='UTC')
except KeyError:
pass
try:
kwargs['expiration_date'] = pd.Timestamp(kwargs['expiration_date'],
tz='UTC')
except KeyError:
pass
# Build an Asset of the appropriate type
asset_type = kwargs.pop('asset_type', 'equity')
if asset_type.lower() == 'equity':
asset = Equity(**kwargs)
elif asset_type.lower() == 'future':
asset = Future(**kwargs)
else:
raise InvalidAssetType(asset_type=asset_type)
self.cache[asset.sid] = asset
self.identifier_cache[identifier] = asset
if asset.symbol is not "":
self.sym_cache.setdefault(asset.symbol, []).append(asset)
return asset
@property
def sids(self):
return self.cache.keys()
@property
def assets(self):
return self.cache.values()
def _lookup_generic_scalar(self,
asset_convertible,
as_of_date,
matches,
missing):
"""
Convert asset_convertible to an asset.
On success, append to matches.
On failure, append to missing.
"""
try:
if isinstance(asset_convertible, Asset):
matches.append(asset_convertible)
elif isinstance(asset_convertible, Integral):
result = self.retrieve_asset(int(asset_convertible))
if result is None:
raise SymbolNotFound(symbol=asset_convertible)
matches.append(result)
elif isinstance(asset_convertible, string_types):
# Throws SymbolNotFound on failure to match.
matches.append(
self.lookup_symbol_resolve_multiple(
asset_convertible,
as_of_date,
)
)
else:
raise NotAssetConvertible(
"Input was %s, not AssetConvertible."
% asset_convertible
)
except SymbolNotFound:
missing.append(asset_convertible)
return None
def lookup_generic(self,
asset_convertible_or_iterable,
as_of_date):
"""
Convert a AssetConvertible or iterable of AssetConvertibles into
a list of Asset objects.
This method exists primarily as a convenience for implementing
user-facing APIs that can handle multiple kinds of input. It should
not be used for internal code where we already know the expected types
of our inputs.
Returns a pair of objects, the first of which is the result of the
conversion, and the second of which is a list containing any values
that couldn't be resolved.
"""
matches = []
missing = []
# Interpret input as scalar.
if isinstance(asset_convertible_or_iterable, AssetConvertible):
self._lookup_generic_scalar(
asset_convertible=asset_convertible_or_iterable,
as_of_date=as_of_date,
matches=matches,
missing=missing,
)
try:
return matches[0], missing
except IndexError:
if hasattr(asset_convertible_or_iterable, '__int__'):
raise SidNotFound(sid=asset_convertible_or_iterable)
else:
raise SymbolNotFound(symbol=asset_convertible_or_iterable)
# Interpret input as iterable.
try:
iterator = iter(asset_convertible_or_iterable)
except TypeError:
raise NotAssetConvertible(
"Input was not a AssetConvertible "
"or iterable of AssetConvertible."
)
for obj in iterator:
self._lookup_generic_scalar(obj, as_of_date, matches, missing)
return matches, missing
def insert_metadata(self, identifier, **kwargs):
"""
Inserts the given metadata kwargs to the entry for the given
identifier. Matching fields in the existing entry will be overwritten.
:param identifier: The identifier for which to insert metadata
:param kwargs: The keyed metadata to insert
"""
entry = self.metadata_cache.get(identifier, {})
for key, value in kwargs.items():
# Do not accept invalid fields
if key not in ASSET_FIELDS:
continue
# Do not accept Nones
if value is None:
continue
# Do not accept nans from dataframes
if isinstance(value, float) and np.isnan(value):
continue
entry[key] = value
self.metadata_cache[identifier] = entry
def consume_identifiers(self, identifiers):
for identifier in identifiers:
self.insert_metadata(identifier)
def consume_metadata(self, metadata):
"""
Consumes the provided metadata in to the metadata cache. The
existing values in the cache will be overwritten when there
is a conflict.
:param metadata: The metadata to be consumed
"""
# Handle dicts
if isinstance(metadata, dict):
self._insert_metadata_dict(metadata)
# Handle DataFrames
elif isinstance(metadata, pd.DataFrame):
self._insert_metadata_dataframe(metadata)
# Handle readables
elif hasattr(metadata, 'read'):
self._insert_metadata_readable(metadata)
else:
raise ConsumeAssetMetaDataError(obj=metadata)
def clear_metadata(self):
self.metadata_cache = {}
def _insert_metadata_dataframe(self, dataframe):
for identifier, row in dataframe.iterrows():
self.insert_metadata(identifier, **row)
def _insert_metadata_dict(self, dict):
for identifier, entry in dict.items():
self.insert_metadata(identifier, **entry)
def _insert_metadata_readable(self, readable):
for row in readable.read():
# Parse out the row of the readable object
metadata_dict = {}
for field in ASSET_FIELDS:
try:
row_value = row[field]
# Avoid passing placeholders
if row_value and (row_value is not 'None'):
metadata_dict[field] = row[field]
except KeyError:
continue
except IndexError:
continue
# Locate the identifier, fail if not found
if 'sid' in metadata_dict:
identifier = metadata_dict['sid']
elif 'symbol' in metadata_dict:
identifier = metadata_dict['symbol']
else:
raise ConsumeAssetMetaDataError(obj=row)
self.insert_metadata(identifier, **metadata_dict)
class AssetConvertible(with_metaclass(ABCMeta)):
"""
ABC for types that are convertible to integer-representations of
Assets.
Includes Asset, six.string_types, and Integral
"""
pass
AssetConvertible.register(Integral)
AssetConvertible.register(Asset)
# Use six.string_types for Python2/3 compatibility
for type in string_types:
AssetConvertible.register(type)
class NotAssetConvertible(ValueError):
pass