From 035bfbd514cde8260c70ba09bbaf024846202ce6 Mon Sep 17 00:00:00 2001 From: jfkirk Date: Mon, 8 Jun 2015 15:53:48 -0400 Subject: [PATCH] ENH: Adds AssetFinder and asset metadata management --- tests/test_assets.py | 500 +++++++++++++++++++++++++++++++++- zipline/assets/__init__.py | 22 ++ zipline/assets/_assets.pyx | 86 ++++-- zipline/assets/assets.py | 541 +++++++++++++++++++++++++++++++++++++ 4 files changed, 1115 insertions(+), 34 deletions(-) create mode 100644 zipline/assets/assets.py diff --git a/tests/test_assets.py b/tests/test_assets.py index fea726ab..8fd41cc7 100644 --- a/tests/test_assets.py +++ b/tests/test_assets.py @@ -20,7 +20,177 @@ Tests for the zipline.assets package import sys from unittest import TestCase -from zipline.assets._assets import Asset, Future +from datetime import ( + datetime, + timedelta, +) +import pickle +import pprint +import pytz +import uuid + +import pandas as pd + +from nose_parameterized import parameterized + +from zipline.finance.trading import with_environment +from zipline.assets import Asset, Future, AssetFinder +from zipline.errors import ( + SymbolNotFound, + MultipleSymbolsFound, +) + + +class FakeTable(object): + def __init__(self, name, count, dt, fuzzy_str): + self.name = name + self.count = count + self.dt = dt + self.fuzzy_str = fuzzy_str + self.df = pd.DataFrame.from_records( + [ + { + 'sid': i, + 'file_name': 'TEST%s%s' % (self.fuzzy_str, i), + 'company_name': self.name + str(i), + 'start_date_nano': pd.Timestamp(dt, tz='UTC').value, + 'end_date_nano': pd.Timestamp(dt, tz='UTC').value, + 'exchange': self.name, + } + for i in range(1, self.count + 1) + ] + ) + + def read(self, *args, **kwargs): + return self.df.to_records() + + +class FakeTableIdenticalSymbols(object): + def __init__(self, name, as_of_dates): + self.name = name + self.as_of_dates = as_of_dates + self.df = pd.DataFrame.from_records( + [ + { + 'sid': i, + 'file_name': self.name, + 'company_name': self.name, + 'start_date_nano': date.value, + 'end_date_nano': (date + timedelta(days=1)).value, + 'exchange': self.name, + } + for i, date in enumerate(self.as_of_dates) + ] + ) + + def read(self, *args, **kwargs): + return self.df.to_records() + + +class FakeTableFromRecords(object): + + def __init__(self, records): + self.records = records + self.df = pd.DataFrame.from_records(self.records) + + def read(self, *args, **kwargs): + return self.df.to_records() + + +@with_environment() +def build_lookup_generic_cases(env=None): + """ + Generate test cases for AssetFinder test_lookup_generic. + """ + + unique_start = pd.Timestamp('2013-01-01', tz='UTC') + unique_end = pd.Timestamp('2014-01-01', tz='UTC') + + dupe_0_start = pd.Timestamp('2013-01-01', tz='UTC') + dupe_0_end = dupe_0_start + timedelta(days=1) + + dupe_1_start = pd.Timestamp('2013-01-03', tz='UTC') + dupe_1_end = dupe_1_start + timedelta(days=1) + + table = FakeTableFromRecords( + [ + { + 'sid': 0, + 'file_name': 'duplicated', + 'company_name': 'duplicated_0', + 'start_date_nano': dupe_0_start.value, + 'end_date_nano': dupe_0_end.value, + 'exchange': '', + }, + { + 'sid': 1, + 'file_name': 'duplicated', + 'company_name': 'duplicated_1', + 'start_date_nano': dupe_1_start.value, + 'end_date_nano': dupe_1_end.value, + 'exchange': '', + }, + { + 'sid': 2, + 'file_name': 'unique', + 'company_name': 'unique', + 'start_date_nano': unique_start.value, + 'end_date_nano': unique_end.value, + 'exchange': '', + }, + ], + ) + env.update_asset_finder(asset_metadata=table.df) + dupe_0, dupe_1, unique = assets = [ + env.asset_finder.retrieve_asset(i) + for i in range(3) + ] + + # This expansion code is run at module import time, which means we have to + # clear the AssetFinder here or else it will interfere with the cache + # for other tests. + env.update_asset_finder(clear_metadata=True) + + dupe_0_start = dupe_0.start_date + dupe_1_start = dupe_1.start_date + cases = [ + ## + # Scalars + + # Asset object + (table, assets[0], None, assets[0]), + (table, assets[1], None, assets[1]), + (table, assets[2], None, assets[2]), + # int + (table, 0, None, assets[0]), + (table, 1, None, assets[1]), + (table, 2, None, assets[2]), + # Duplicated symbol with resolution date + (table, 'duplicated', dupe_0_start, dupe_0), + (table, 'duplicated', dupe_1_start, dupe_1), + # Unique symbol, with or without resolution date. + (table, 'unique', unique_start, unique), + (table, 'unique', None, unique), + + ## + # Iterables + + # Iterables of Asset objects. + (table, assets, None, assets), + (table, iter(assets), None, assets), + # Iterables of ints + (table, (0, 1), None, assets[:-1]), + (table, iter((0, 1)), None, assets[:-1]), + # Iterables of symbols. + (table, ('duplicated', 'unique'), dupe_0_start, [dupe_0, unique]), + (table, ('duplicated', 'unique'), dupe_1_start, [dupe_1, unique]), + # Mixed types + (table, + ('duplicated', 2, 'unique', 1, dupe_1), + dupe_0_start, + [dupe_0, assets[2], unique, assets[1], dupe_1]), + ] + return cases class AssetTestCase(TestCase): @@ -35,8 +205,53 @@ class AssetTestCase(TestCase): self.assertEquals(str(Asset(5061)), 'Asset(5061)') + def test_asset_is_pickleable(self): -class TestAssetRichCmp(TestCase): + # Very wow + s = Asset( + 1337, + symbol="DOGE", + asset_name="DOGECOIN", + start_date=pd.Timestamp('2013-12-08 9:31AM', tz='UTC'), + end_date=pd.Timestamp('2014-06-25 11:21AM', tz='UTC'), + first_traded=pd.Timestamp('2013-12-08 9:31AM', tz='UTC'), + exchange='THE MOON', + ) + s_unpickled = pickle.loads(pickle.dumps(s)) + + attrs_to_check = ['end_date', + 'exchange', + 'first_traded', + 'asset_end_date', + 'asset_name', + 'asset_start_date', + 'sid', + 'start_date', + 'symbol'] + + for attr in attrs_to_check: + self.assertEqual(getattr(s, attr), getattr(s_unpickled, attr)) + + def test_asset_comparisons(self): + + s_23 = Asset(23) + s_24 = Asset(24) + + self.assertEqual(s_23, s_23) + self.assertEqual(s_23, 23) + self.assertEqual(23, s_23) + + self.assertNotEqual(s_23, s_24) + self.assertNotEqual(s_23, 24) + self.assertNotEqual(s_23, "23") + self.assertNotEqual(s_23, 23.5) + self.assertNotEqual(s_23, []) + self.assertNotEqual(s_23, None) + + self.assertLess(s_23, s_24) + self.assertLess(s_23, 24) + self.assertGreater(24, s_23) + self.assertGreater(s_24, s_23) def test_lt(self): self.assertTrue(Asset(3) < Asset(4)) @@ -74,16 +289,279 @@ class TestAssetRichCmp(TestCase): 'a' < Asset(3) -class testFuture(TestCase): +class TestFuture(TestCase): + + future = Future(2468, + symbol='OMK15', + notice_date='2014-01-20', + expiration_date='2014-02-20', + contract_multiplier=500) + + def test_str(self): + strd = self.future.__str__() + self.assertEqual("Future(2468 [OMK15])", strd) def test_repr(self): + reprd = self.future.__repr__() + self.assertTrue("Future" in reprd) + self.assertTrue("2468" in reprd) + self.assertTrue("OMK15" in reprd) + self.assertTrue("notice_date='2014-01-20'" in reprd) + self.assertTrue("expiration_date='2014-02-20'" in reprd) + self.assertTrue("contract_multiplier=500" in reprd) - future = Future(2468, - notice_date='2014-01-20', - expiration_date='2014-02-20') - rep = future.__repr__() + def test_reduce(self): + reduced = self.future.__reduce__() + self.assertEqual(Future, reduced[0]) - self.assertTrue("Future" in rep) - self.assertTrue("2468" in rep) - self.assertTrue("notice_date='2014-01-20'" in rep) - self.assertTrue("expiration_date='2014-02-20'" in rep) + def test_to_and_from_dict(self): + dictd = self.future.to_dict() + self.assertTrue('notice_date' in dictd) + self.assertTrue('expiration_date' in dictd) + self.assertTrue('contract_multiplier' in dictd) + + from_dict = Future.from_dict(dictd) + self.assertTrue(isinstance(from_dict, Future)) + self.assertEqual(self.future, from_dict) + + +class AssetFinderTestCase(TestCase): + + @with_environment() + def test_lookup_symbol_fuzzy(self, env=None): + fuzzy_str = '@' + as_of_date = datetime(2013, 1, 1, tzinfo=pytz.utc) + table = FakeTable(uuid.uuid4().hex, 2, as_of_date, + fuzzy_str) + env.update_asset_finder(asset_metadata=table.df) + sf = env.asset_finder + + try: + for i in range(2): # we do it twice to test for caching bugs + self.assertIsNone(sf.lookup_symbol('test', as_of_date)) + self.assertIsNotNone(sf.lookup_symbol( + 'test%s%s' % (fuzzy_str, 1), as_of_date)) + self.assertIsNone(sf.lookup_symbol('test%s' % 1, as_of_date)) + + self.assertIsNone(sf.lookup_symbol(table.name, as_of_date, + fuzzy=fuzzy_str)) + self.assertIsNotNone(sf.lookup_symbol( + 'test%s%s' % (fuzzy_str, 1), as_of_date, fuzzy=fuzzy_str)) + self.assertIsNotNone(sf.lookup_symbol( + 'test%s' % 1, as_of_date, fuzzy=fuzzy_str)) + finally: + env.update_asset_finder(clear_metadata=True) + + @with_environment() + def test_lookup_symbol_resolve_multiple(self, env=None): + + as_of_dates = [ + pd.Timestamp('2013-01-01', tz='UTC') + timedelta(days=i) + # Incrementing by two so that start and end dates for each + # generated Asset don't overlap (each Asset's end_date is the + # day after its start date.) + for i in range(0, 10, 2) + ] + + table = FakeTableIdenticalSymbols( + name='existing', + as_of_dates=as_of_dates, + ) + env.update_asset_finder(asset_metadata=table.df) + sf = env.asset_finder + + try: + for _ in range(2): # we do it twice to test for caching bugs + with self.assertRaises(SymbolNotFound): + sf.lookup_symbol_resolve_multiple('non_existing', + as_of_dates[0]) + with self.assertRaises(MultipleSymbolsFound): + sf.lookup_symbol_resolve_multiple('existing', + None) + + for i, date in enumerate(as_of_dates): + # Verify that we correctly resolve multiple symbols using + # the supplied date + result = sf.lookup_symbol_resolve_multiple( + 'existing', + date, + ) + self.assertEqual(result.symbol, 'existing') + self.assertEqual(result.sid, i) + + finally: + env.update_asset_finder(clear_metadata=True) + + @with_environment() + def test_lookup_symbol_nasdaq_underscore_collisions(self, env=None): + """ + Ensure that each NASDAQ symbol without underscores maps back to the + original symbol when using fuzzy matching. + """ + sf = env.asset_finder + fuzzy_str = '_' + collisions = [] + + try: + for sid in sf.sids: + sec = sf.retrieve_asset(sid) + if sec.exchange.startswith('NASDAQ'): + found = sf.lookup_symbol(sec.symbol.replace(fuzzy_str, ''), + sec.end_date, fuzzy=fuzzy_str) + if found != sec: + collisions.append((found, sec)) + + # KNOWN BUG: Filter out assets that have intersections in their + # start and end dates. We can't correctly resolve these. + unexpected_errors = [] + for first, second in collisions: + overlapping_dates = ( + first.end_date >= second.start_date or + second.end_date >= first.end_date + ) + if not overlapping_dates: + unexpected_errors.append((first, second)) + + self.assertFalse( + unexpected_errors, + pprint.pformat(unexpected_errors), + ) + finally: + env.update_asset_finder(clear_metadata=True) + + @parameterized.expand( + build_lookup_generic_cases() + ) + @with_environment() + def test_lookup_generic(self, table, symbols, reference_date, expected, + env=None): + """ + Ensure that lookup_generic works with various permutations of inputs. + """ + try: + env.update_asset_finder(asset_metadata=table.df) + finder = env.asset_finder + results, missing = finder.lookup_generic(symbols, reference_date) + self.assertEqual(results, expected) + self.assertEqual(missing, []) + finally: + env.update_asset_finder(clear_metadata=True) + + @with_environment() + def test_lookup_generic_handle_missing(self, env=None): + try: + table = FakeTableFromRecords( + [ + # Sids that will be found when we do lookups. + { + 'sid': 0, + 'file_name': 'real', + 'company_name': 'real', + 'start_date_nano': pd.Timestamp('2013-1-1', tz='UTC'), + 'end_date_nano': pd.Timestamp('2014-1-1', tz='UTC'), + 'exchange': '', + }, + { + 'sid': 1, + 'file_name': 'also_real', + 'company_name': 'also_real', + 'start_date_nano': pd.Timestamp('2013-1-1', tz='UTC'), + 'end_date_nano': pd.Timestamp('2014-1-1', tz='UTC'), + 'exchange': '', + }, + # Sid whose end date is before our query date. We should + # still correctly find it. + { + 'sid': 2, + 'file_name': 'real_but_old', + 'company_name': 'real_but_old', + 'start_date_nano': pd.Timestamp('2002-1-1', tz='UTC'), + 'end_date_nano': pd.Timestamp('2003-1-1', tz='UTC'), + 'exchange': '', + }, + # Sid whose end date is before our query date. We should + # still correctly find it. + { + 'sid': 3, + 'file_name': 'real_but_in_the_future', + 'company_name': 'real_but_in_the_future', + 'start_date_nano': pd.Timestamp('2014-1-1', tz='UTC'), + 'end_date_nano': pd.Timestamp('2020-1-1', tz='UTC'), + 'exchange': 'THE FUTURE', + }, + ] + ) + env.update_asset_finder(asset_metadata=table.df) + symbols = [ + 'real', 1, 'fake', 'real_but_old', 'real_but_in_the_future', + ] + + results, missing = env.asset_finder.lookup_generic( + symbols, + pd.Timestamp('2013-02-01', tz='UTC'), + ) + + self.assertEqual(len(results), 3) + self.assertEqual(results[0].symbol, 'real') + self.assertEqual(results[0].sid, 0) + self.assertEqual(results[1].symbol, 'also_real') + self.assertEqual(results[1].sid, 1) + + self.assertEqual(len(missing), 2) + self.assertEqual(missing[0], 'fake') + self.assertEqual(missing[1], 'real_but_in_the_future') + + finally: + env.update_asset_finder(clear_metadata=True) + + def test_insert_metadata(self): + finder = AssetFinder() + finder.insert_metadata(0, + asset_type='equity', + start_date='2014-01-01', + end_date='2015-01-01', + symbol="PLAY", + foo_data="FOO",) + + # Test proper insertion + self.assertEqual('equity', finder.metadata_cache[0]['asset_type']) + self.assertEqual('PLAY', finder.metadata_cache[0]['symbol']) + self.assertEqual('2015-01-01', finder.metadata_cache[0]['end_date']) + + # Test invalid field + self.assertFalse('foo_data' in finder.metadata_cache[0]) + + # Test updating fields + finder.insert_metadata(0, + asset_type='equity', + start_date='2014-01-01', + end_date='2015-02-01', + symbol="PLAY", + exchange="NYSE",) + self.assertEqual('2015-02-01', finder.metadata_cache[0]['end_date']) + self.assertEqual('NYSE', finder.metadata_cache[0]['exchange']) + + # Check that old data survived + self.assertEqual('PLAY', finder.metadata_cache[0]['symbol']) + + def test_consume_metadata(self): + + # Test dict consumption + finder = AssetFinder({0: {'asset_type': 'equity'}}) + dict_to_consume = {0: {'symbol': 'PLAY'}, + 1: {'symbol': 'MSFT'}} + finder.consume_metadata(dict_to_consume) + self.assertEqual('equity', finder.metadata_cache[0]['asset_type']) + self.assertEqual('PLAY', finder.metadata_cache[0]['symbol']) + + # Test dataframe consumption + df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1]) + df['asset_name'][0] = "Dave'N'Busters" + df['exchange'][0] = "NASDAQ" + df['asset_name'][1] = "Microsoft" + df['exchange'][1] = "NYSE" + finder.consume_metadata(df) + self.assertEqual('NASDAQ', finder.metadata_cache[0]['exchange']) + self.assertEqual('Microsoft', finder.metadata_cache[1]['asset_name']) + # Check that old data survived + self.assertEqual('equity', finder.metadata_cache[0]['asset_type']) diff --git a/zipline/assets/__init__.py b/zipline/assets/__init__.py index 73d0bbd1..b088688e 100644 --- a/zipline/assets/__init__.py +++ b/zipline/assets/__init__.py @@ -12,3 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from ._assets import ( + Asset, + Equity, + Future, + make_asset_array, + CACHE_FILE_TEMPLATE +) +from .assets import ( + AssetFinder, + AssetConvertible +) + +__all__ = [ + 'Asset', + 'Equity', + 'Future', + 'AssetFinder', + 'AssetConvertible', + 'make_asset_array', + 'CACHE_FILE_TEMPLATE' +] diff --git a/zipline/assets/_assets.pyx b/zipline/assets/_assets.pyx index 16b8ba06..89d6761b 100644 --- a/zipline/assets/_assets.pyx +++ b/zipline/assets/_assets.pyx @@ -21,9 +21,10 @@ cimport cython import numpy as np cimport numpy as np -cdef enum AssetType: - EQUITY = 1 - FUTURE = 2 +# IMPORTANT NOTE: You must change this template if you change +# Asset.__reduce__, or else we'll attempt to unpickle an old version of this +# class +CACHE_FILE_TEMPLATE = '/tmp/.%s-%s.v4.cache' cdef class Asset: @@ -33,9 +34,7 @@ cdef class Asset: cdef readonly object symbol cdef readonly object asset_name - cdef readonly AssetType asset_type - # TODO: Maybe declare as pandas Timestamp? cdef readonly object start_date cdef readonly object end_date cdef public object first_traded @@ -179,26 +178,21 @@ cdef class Asset: 'exchange': self.exchange, } - @staticmethod - def from_dict(dict_): + @classmethod + def from_dict(cls, dict_): """ Build an Asset instance from a dict. """ - return Asset(**dict_) + return cls(**dict_) cdef class Equity(Asset): - def __cinit__(self, - int sid, # sid is required - object symbol="", - object asset_name="", - object start_date=None, - object end_date=None, - object first_traded=None, - object exchange=""): - - self.asset_type = EQUITY + def __str__(self): + if self.symbol: + return 'Equity(%d [%s])' % (self.sid, self.symbol) + else: + return 'Equity(%d)' % self.sid def __repr__(self): attrs = ('symbol', 'asset_name', 'exchange', @@ -214,6 +208,7 @@ cdef class Future(Asset): cdef readonly object notice_date cdef readonly object expiration_date + cdef readonly int contract_multiplier def __cinit__(self, int sid, # sid is required @@ -224,18 +219,63 @@ cdef class Future(Asset): object notice_date=None, object expiration_date=None, object first_traded=None, - object exchange=""): + object exchange="", + int contract_multiplier=1): - self.asset_type = FUTURE - self.notice_date = notice_date - self.expiration_date = expiration_date + self.notice_date = notice_date + self.expiration_date = expiration_date + self.contract_multiplier = contract_multiplier + + # Assign the expiration as the end_date if end_date is not explicit + if self.end_date is None: + self.end_date = expiration_date + + def __str__(self): + if self.symbol: + return 'Future(%d [%s])' % (self.sid, self.symbol) + else: + return 'Future(%d)' % self.sid def __repr__(self): attrs = ('symbol', 'asset_name', 'exchange', 'start_date', 'end_date', 'first_traded', 'notice_date', - 'expiration_date') + 'expiration_date', 'contract_multiplier') tuples = ((attr, repr(getattr(self, attr, None))) for attr in attrs) strings = ('%s=%s' % (t[0], t[1]) for t in tuples) params = ', '.join(strings) return 'Future(%d, %s)' % (self.sid, params) + + cpdef __reduce__(self): + """ + Function used by pickle to determine how to serialize/deserialize this + class. Should return a tuple whose first element is self.__class__, + and whose second element is a tuple of all the attributes that should + be serialized/deserialized during pickling. + """ + return (self.__class__, (self.sid, + self.symbol, + self.asset_name, + self.start_date, + self.end_date, + self.notice_date, + self.expiration_date, + self.first_traded, + self.exchange, + self.contract_multiplier,)) + + cpdef to_dict(self): + """ + Convert to a python dict. + """ + super_dict = super(Future, self).to_dict() + super_dict['notice_date'] = self.notice_date + super_dict['expiration_date'] = self.expiration_date + super_dict['contract_multiplier'] = self.contract_multiplier + return super_dict + + +def make_asset_array(int size, Asset asset): + cdef np.ndarray out = np.empty([size], dtype=object) + out.fill(asset) + return out diff --git a/zipline/assets/assets.py b/zipline/assets/assets.py new file mode 100644 index 00000000..8cc234b3 --- /dev/null +++ b/zipline/assets/assets.py @@ -0,0 +1,541 @@ +# +# Copyright 2015 Quantopian, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABCMeta +from itertools import chain +from numbers import Integral +import numpy as np +import operator + +from logbook import Logger +import pandas as pd +from pandas.tseries.tools import normalize_date +from six import with_metaclass, string_types + +from zipline.errors import ( + SymbolNotFound, + MultipleSymbolsFound, + SidNotFound, + IdentifierNotFound, + ConsumeAssetMetaDataError, + InvalidAssetType, +) +from zipline.utils import tradingcalendar +from zipline.assets._assets import ( + Asset, Equity, Future +) + +log = Logger('assets.py') + +# Expected fields for an Asset's metadata +ASSET_FIELDS = [ + 'sid', + 'asset_type', + 'symbol', + 'asset_name', + 'start_date', + 'end_date', + 'first_traded', + 'exchange', + 'notice_date', + 'expiration_date', + 'contract_multiplier', + # The following fields are for compatibility with other systems + 'file_name', # Used as symbol + 'company_name', # Used as asset_name + 'start_date_nano', # Used as start_date + 'end_date_nano', # Used as end_date +] + + +class AssetFinder(object): + + def __init__(self, + metadata=None, + trading_calendar=tradingcalendar): + + # Any particular instance of AssetFinder should be + # consistent throughout its lifetime, so we grab a reference + # to our cache now. That way, if the cache is refreshed later, + # our instance will continue to use the old one. + self.cache = {} + self.sym_cache = {} + self.identifier_cache = {} + self.fuzzy_match = {} + + # The AssetFinder also holds a nested-dict of all metadata for + # reference when building Assets + self.metadata_cache = {} + if metadata: + self.consume_metadata(metadata) + + self.trading_calendar = trading_calendar + self.populate_cache() + + def _next_free_sid(self): + if len(self.cache) > 0: + return max(self.cache.keys()) + 1 + return 0 + + def _assign_sid(self, identifier): + if hasattr(identifier, '__int__'): + return identifier.__int__() + if isinstance(identifier, string_types): + return self._next_free_sid() + + def retrieve_asset(self, sid, default_none=False): + if isinstance(sid, Asset): + return sid + asset = self.cache.get(sid) + if asset is not None: + return asset + elif default_none: + return None + else: + raise SidNotFound(sid=sid) + + def retrieve_asset_by_identifier(self, identifier): + if isinstance(identifier, Asset): + return identifier + asset = self.identifier_cache.get(identifier) + if asset is not None: + return asset + else: + raise IdentifierNotFound(identifier=identifier) + + @staticmethod + def _lookup_symbol_in_infos(infos, as_of_date): + """ + Search a list of symbols matching a given asset for the most recent + known symbol as of as_of_date. + + Returns a pair of (Asset, bool), representing the best match we + found for as_of_date, and whether or not that match was actually + trading at as_of_date. + + If no entry in infos started before as_of_date, return (None, False). + """ + # Sort entries by end_date before iterating. If asset start and end + # dates were always disjoint, then we could sort by either start or + # end_date and get the same sorting. + infos = sorted(infos, key=operator.attrgetter('end_date')) + + # Find the newest asset that started before as_of_date. + candidates = [i for i in infos + if (i.start_date is None or i.start_date <= as_of_date) + and (i.end_date is None or as_of_date <= i.end_date)] + + # If one SID exists for symbol, return that symbol + if len(candidates) == 1: + return candidates[0], True + + # If no SID exists for symbol, return SID with the + # highest-but-not-over end_date + if len(candidates) == 0: + candidates = [i for i in infos + if i.end_date < as_of_date] + return (candidates[-1], False) if candidates else (None, False) + + # If multiple SIDs exist for symbol, return latest start_date with + # end_date as a tie-breaker + if len(candidates) > 1: + best_candidate = sorted( + candidates, + key=lambda x: (x.start_date, x.end_date) + )[-1] + return best_candidate, True + + def lookup_symbol_resolve_multiple(self, symbol, as_of_date=None): + """ + Return matching Asset of name symbol in database. + + If multiple Assets are found and as_of_date is not set, + raises MultipleSymbolsFound. + + If no Asset was active at as_of_date, and allow_expired is False + raises SymbolNotFound. + """ + if as_of_date is not None: + as_of_date = normalize_date(as_of_date) + + if symbol not in self.sym_cache: + raise SymbolNotFound(symbol=symbol) + + infos = self.sym_cache[symbol] + if as_of_date is None: + if len(infos) == 1: + return infos[0] + else: + raise MultipleSymbolsFound(symbol=symbol, + options=str(infos)) + + # Try to find symbol matching as_of_date + asset, _ = self._lookup_symbol_in_infos(infos, as_of_date) + if asset is None: + raise SymbolNotFound(symbol=symbol) + return asset + + def lookup_symbol(self, symbol, as_of_date, fuzzy=None): + """ + If a fuzzy string is provided, then we try various symbols based on + the provided symbol. This is to facilitate mapping from a broker's + symbol to ours in cases where mapping to the broker's symbol loses + information. For example, if we have CMCS_A, but a broker has CMCSA, + when the broker provides CMCSA, it can also provide fuzzy='_', + so we can find a match by inserting an underscore. + """ + symbol = symbol.upper() + as_of_date = normalize_date(as_of_date) + + if not fuzzy: + try: + return self.lookup_symbol_resolve_multiple(symbol, as_of_date) + except SymbolNotFound: + return None + else: + try: + return self.fuzzy_match[(symbol, fuzzy, as_of_date)] + except KeyError: + # if symbol is CMCSA and fuzzy is '_', then + # try CMCSA, then CMCS_A, then CMC_SA, etc. + for fuzzy_symbol in chain( + (symbol,), + (symbol[:i] + fuzzy + symbol[i:] + for i in range(len(symbol) - 1, 0, -1))): + + infos = self.sym_cache.get(fuzzy_symbol) + if infos: + info, date_match = self._lookup_symbol_in_infos( + infos, + as_of_date, + ) + + if info is not None and date_match: + self.fuzzy_match[(symbol, fuzzy, as_of_date)] = \ + info + return info + else: + self.fuzzy_match[(symbol, fuzzy, as_of_date)] = None + + def populate_cache(self): + """ + Populates the asset cache with all values in the assets + collection. + """ + + # Wipe caches before repopulating + self.cache = {} + self.sym_cache = {} + self.identifier_cache = {} + self.fuzzy_match = {} + + counter = 0 + for identifier, row in self.metadata_cache.items(): + self.spawn_asset(identifier=identifier, **row) + counter += 1 + + def spawn_asset(self, identifier, **kwargs): + + # Check if the sid is declared + try: + kwargs['sid'] + pass + except KeyError: + # Assign the identifier as the sid, if applicable + if isinstance(identifier, int): + kwargs['sid'] = identifier + # If the identifier is not a sid, assign one + else: + kwargs['sid'] = self._assign_sid(identifier) + # Update the metadata object with the new sid + self.insert_metadata(identifier=identifier, sid=kwargs['sid']) + + # If the file_name is in the kwargs, it may be the symbol + try: + file_name = kwargs.pop('file_name') + try: + kwargs['symbol'] + except KeyError: + kwargs['symbol'] = file_name + except KeyError: + pass + + # If the identifier coming in was a string and there is no defined + # symbol yet, set the symbol to the incoming identifier + try: + kwargs['symbol'] + pass + except KeyError: + if isinstance(identifier, string_types): + kwargs['symbol'] = identifier + + # If the company_name is in the kwargs, it may be the asset_name + try: + company_name = kwargs.pop('company_name') + try: + kwargs['asset_name'] + except KeyError: + kwargs['asset_name'] = company_name + except KeyError: + pass + + # If dates are given as nanos, pop them + try: + kwargs['start_date'] = kwargs.pop('start_date_nano') + except KeyError: + pass + try: + kwargs['end_date'] = kwargs.pop('end_date_nano') + except KeyError: + pass + try: + kwargs['notice_date'] = kwargs.pop('notice_date_nano') + except KeyError: + pass + try: + kwargs['expiration_date'] = kwargs.pop('expiration_date_nano') + except KeyError: + pass + + # Process dates to Timestamps + try: + kwargs['start_date'] = pd.Timestamp(kwargs['start_date'], tz='UTC') + except KeyError: + pass + try: + kwargs['end_date'] = pd.Timestamp(kwargs['end_date'], tz='UTC') + except KeyError: + pass + try: + kwargs['notice_date'] = pd.Timestamp(kwargs['notice_date'], + tz='UTC') + except KeyError: + pass + try: + kwargs['expiration_date'] = pd.Timestamp(kwargs['expiration_date'], + tz='UTC') + except KeyError: + pass + + # Build an Asset of the appropriate type + asset_type = kwargs.pop('asset_type', 'equity') + if asset_type.lower() == 'equity': + asset = Equity(**kwargs) + elif asset_type.lower() == 'future': + asset = Future(**kwargs) + else: + raise InvalidAssetType(asset_type=asset_type) + + self.cache[asset.sid] = asset + self.identifier_cache[identifier] = asset + if asset.symbol is not "": + self.sym_cache.setdefault(asset.symbol, []).append(asset) + + return asset + + @property + def sids(self): + return self.cache.keys() + + @property + def assets(self): + return self.cache.values() + + def _lookup_generic_scalar(self, + asset_convertible, + as_of_date, + matches, + missing): + """ + Convert asset_convertible to an asset. + + On success, append to matches. + On failure, append to missing. + """ + try: + if isinstance(asset_convertible, Asset): + matches.append(asset_convertible) + + elif isinstance(asset_convertible, Integral): + result = self.retrieve_asset(int(asset_convertible)) + if result is None: + raise SymbolNotFound(symbol=asset_convertible) + matches.append(result) + + elif isinstance(asset_convertible, string_types): + # Throws SymbolNotFound on failure to match. + matches.append( + self.lookup_symbol_resolve_multiple( + asset_convertible, + as_of_date, + ) + ) + else: + raise NotAssetConvertible( + "Input was %s, not AssetConvertible." + % asset_convertible + ) + + except SymbolNotFound: + missing.append(asset_convertible) + return None + + def lookup_generic(self, + asset_convertible_or_iterable, + as_of_date): + """ + Convert a AssetConvertible or iterable of AssetConvertibles into + a list of Asset objects. + + This method exists primarily as a convenience for implementing + user-facing APIs that can handle multiple kinds of input. It should + not be used for internal code where we already know the expected types + of our inputs. + + Returns a pair of objects, the first of which is the result of the + conversion, and the second of which is a list containing any values + that couldn't be resolved. + """ + matches = [] + missing = [] + + # Interpret input as scalar. + if isinstance(asset_convertible_or_iterable, AssetConvertible): + self._lookup_generic_scalar( + asset_convertible=asset_convertible_or_iterable, + as_of_date=as_of_date, + matches=matches, + missing=missing, + ) + try: + return matches[0], missing + except IndexError: + if hasattr(asset_convertible_or_iterable, '__int__'): + raise SidNotFound(sid=asset_convertible_or_iterable) + else: + raise SymbolNotFound(symbol=asset_convertible_or_iterable) + + # Interpret input as iterable. + try: + iterator = iter(asset_convertible_or_iterable) + except TypeError: + raise NotAssetConvertible( + "Input was not a AssetConvertible " + "or iterable of AssetConvertible." + ) + + for obj in iterator: + self._lookup_generic_scalar(obj, as_of_date, matches, missing) + return matches, missing + + def insert_metadata(self, identifier, **kwargs): + """ + Inserts the given metadata kwargs to the entry for the given + identifier. Matching fields in the existing entry will be overwritten. + :param identifier: The identifier for which to insert metadata + :param kwargs: The keyed metadata to insert + """ + entry = self.metadata_cache.get(identifier, {}) + + for key, value in kwargs.items(): + # Do not accept invalid fields + if key not in ASSET_FIELDS: + continue + # Do not accept Nones + if value is None: + continue + # Do not accept nans from dataframes + if isinstance(value, float) and np.isnan(value): + continue + entry[key] = value + + self.metadata_cache[identifier] = entry + + def consume_identifiers(self, identifiers): + for identifier in identifiers: + self.insert_metadata(identifier) + + def consume_metadata(self, metadata): + """ + Consumes the provided metadata in to the metadata cache. The + existing values in the cache will be overwritten when there + is a conflict. + :param metadata: The metadata to be consumed + """ + # Handle dicts + if isinstance(metadata, dict): + self._insert_metadata_dict(metadata) + # Handle DataFrames + elif isinstance(metadata, pd.DataFrame): + self._insert_metadata_dataframe(metadata) + # Handle readables + elif hasattr(metadata, 'read'): + self._insert_metadata_readable(metadata) + else: + raise ConsumeAssetMetaDataError(obj=metadata) + + def clear_metadata(self): + self.metadata_cache = {} + + def _insert_metadata_dataframe(self, dataframe): + for identifier, row in dataframe.iterrows(): + self.insert_metadata(identifier, **row) + + def _insert_metadata_dict(self, dict): + for identifier, entry in dict.items(): + self.insert_metadata(identifier, **entry) + + def _insert_metadata_readable(self, readable): + for row in readable.read(): + # Parse out the row of the readable object + metadata_dict = {} + for field in ASSET_FIELDS: + try: + row_value = row[field] + # Avoid passing placeholders + if row_value and (row_value is not 'None'): + metadata_dict[field] = row[field] + except KeyError: + continue + except IndexError: + continue + # Locate the identifier, fail if not found + if 'sid' in metadata_dict: + identifier = metadata_dict['sid'] + elif 'symbol' in metadata_dict: + identifier = metadata_dict['symbol'] + else: + raise ConsumeAssetMetaDataError(obj=row) + self.insert_metadata(identifier, **metadata_dict) + + +class AssetConvertible(with_metaclass(ABCMeta)): + """ + ABC for types that are convertible to integer-representations of + Assets. + + Includes Asset, six.string_types, and Integral + """ + pass + +AssetConvertible.register(Integral) +AssetConvertible.register(Asset) +# Use six.string_types for Python2/3 compatibility +for type in string_types: + AssetConvertible.register(type) + + +class NotAssetConvertible(ValueError): + pass