diff --git a/tests/test_assets.py b/tests/test_assets.py index ebe11e37..01638e4a 100644 --- a/tests/test_assets.py +++ b/tests/test_assets.py @@ -292,12 +292,11 @@ class AssetFinderTestCase(TestCase): [ { 'sid': i, - 'symbol': 'TEST@%d' % i, + 'symbol': 'TEST.%d' % i, 'company_name': "company%d" % i, 'start_date': as_of.value, 'end_date': as_of.value, - 'exchange': uuid.uuid4().hex, - 'fuzzy': 'TEST%d' % i + 'exchange': uuid.uuid4().hex } for i in range(3) ] @@ -308,25 +307,26 @@ class AssetFinderTestCase(TestCase): finder.retrieve_asset(i) for i in range(3) ) - for i in range(2): # we do it twice to test for caching bugs + # we do it twice to catch caching bugs + for i in range(2): + # Shouldn't find this with no fuzzy_str passed. self.assertIsNone(finder.lookup_symbol('test', as_of)) + self.assertIsNone(finder.lookup_symbol('test1', as_of)) self.assertEqual( asset_1, - finder.lookup_symbol('test@1', as_of) + finder.lookup_symbol('test.1', as_of) ) # Adding an unnecessary fuzzy shouldn't matter. self.assertEqual( asset_1, - finder.lookup_symbol('test@1', as_of, fuzzy=True) + finder.lookup_symbol('test/1', as_of) ) - # Shouldn't find this with no fuzzy_str passed. - self.assertIsNone(finder.lookup_symbol('test1', as_of)) # Should find exact match. 
self.assertEqual( asset_1, - finder.lookup_symbol('test1', as_of, fuzzy=True), + finder.lookup_symbol('test-1', as_of), ) def test_lookup_symbol_resolve_multiple(self): diff --git a/zipline/assets/asset_writer.py b/zipline/assets/asset_writer.py index 2415fb6e..d21503e0 100644 --- a/zipline/assets/asset_writer.py +++ b/zipline/assets/asset_writer.py @@ -4,7 +4,9 @@ from abc import ( ) from collections import namedtuple +import re import pandas as pd +import numpy as np from six import with_metaclass import sqlalchemy as sa @@ -34,7 +36,10 @@ FUTURE_TABLE_FIELDS = ASSET_TABLE_FIELDS | { } # Expected fields for an Equity's metadata -EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS +EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS | { + 'company_symbol', + 'share_class_symbol', +} EXCHANGE_TABLE_FIELDS = frozenset({ 'exchange', @@ -88,6 +93,43 @@ _root_symbols_defaults = { 'exchange': None, } +# Fuzzy symbol delimiters that may break up a company symbol and share class +_fuzzy_symbol_delimiter_regex = r'[./\-_]' +_fuzzy_symbol_default_triggers = frozenset({np.nan, None, ''}) + + +def split_fuzzy_symbol(fuzzy_symbol): + """ + Takes in a symbol that may be fuzzy and splits it into a company symbol + and share class symbol. 
+ + Parameters + ---------- + fuzzy_symbol : str + The possibly-fuzzy symbol to be split + + Returns + ------- + ( str, str ) + A tuple of ( company_symbol, share_class_symbol ) + """ + # return blank strings for any bad fuzzy symbols, like NaN or None + if fuzzy_symbol in _fuzzy_symbol_default_triggers: + return ('', '') + + split_list = re.split(pattern=_fuzzy_symbol_delimiter_regex, + string=fuzzy_symbol, + maxsplit=1) + + # Break the list up into its two components, the company symbol and the + # share class symbol + company_symbol = split_list[0] + if len(split_list) > 1: + share_class_symbol = split_list[1] + else: + share_class_symbol = '' + return (company_symbol, share_class_symbol) + def _generate_output_dataframe(data_subset, defaults): """ @@ -163,7 +205,6 @@ class AssetDBWriter(with_metaclass(ABCMeta)): """ def write_all(self, engine, - fuzzy_char=None, allow_sid_assignment=True, constraints=True): """ Write pre-supplied data to SQLite. @@ -172,8 +213,6 @@ class AssetDBWriter(with_metaclass(ABCMeta)): ---------- engine : Engine An SQLAlchemy engine to a SQL database. - fuzzy_char : str, optional - A string for use in fuzzy matching. allow_sid_assignment: bool, optional If True then the class can assign sids where necessary. constraints : bool, optional @@ -192,7 +231,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)): self._write_exchanges(data.exchanges, txn) self._write_root_symbols(data.root_symbols, txn) self._write_futures(data.futures, txn) - self._write_equities(data.equities, fuzzy_char, txn) + self._write_equities(data.equities, txn) def _write_exchanges(self, exchanges, bind=None): recs = exchanges.reset_index().rename_axis( @@ -222,11 +261,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)): self.asset_router.insert().values([(record['sid'], 'future')])\ .execute(bind=bind) - def _write_equities(self, equities, fuzzy_char, bind=None): - # Apply fuzzy matching. 
- if fuzzy_char: - equities['fuzzy'] = equities['symbol'].str.replace(fuzzy_char, '') - + def _write_equities(self, equities, bind=None): recs = equities.reset_index().rename_axis( {'index': 'sid'}, 1, @@ -258,6 +293,8 @@ class AssetDBWriter(with_metaclass(ABCMeta)): primary_key=constraints, ), sa.Column('symbol', sa.Text), + sa.Column('company_symbol', sa.Text), + sa.Column('share_class_symbol', sa.Text), sa.Column('asset_name', sa.Text), sa.Column('start_date', sa.Integer, default=0), sa.Column('end_date', sa.Integer), @@ -366,6 +403,15 @@ class AssetDBWriter(with_metaclass(ABCMeta)): defaults=_equities_defaults, ) + # Split symbols to company_symbols and share_class_symbols + tuple_series = equities_output['symbol'].apply(split_fuzzy_symbol) + split_symbols = pd.DataFrame( + tuple_series.tolist(), + columns=['company_symbol', 'share_class_symbol'], + index=tuple_series.index + ) + equities_output = equities_output.join(split_symbols) + # Convert date columns to UNIX Epoch integers (nanoseconds) equities_output['start_date'] = \ equities_output['start_date'].apply(self.convert_datetime) diff --git a/zipline/assets/assets.py b/zipline/assets/assets.py index 8e9ab400..3b44b1b2 100644 --- a/zipline/assets/assets.py +++ b/zipline/assets/assets.py @@ -39,6 +39,7 @@ from zipline.assets import ( from zipline.assets.asset_writer import ( FUTURE_TABLE_FIELDS, EQUITY_TABLE_FIELDS, + split_fuzzy_symbol, ) log = Logger('assets.py') @@ -273,13 +274,16 @@ class AssetFinder(object): if as_of_date is not None: as_of_date = pd.Timestamp(normalize_date(as_of_date)) + company_symbol, share_class_symbol = split_fuzzy_symbol(symbol) + equities_cols = self.equities.c if as_of_date: ad_value = as_of_date.value # If one SID exists for symbol, return that symbol candidates = sa.select((equities_cols.sid,)).where( - (equities_cols.symbol == symbol) & + (equities_cols.company_symbol == company_symbol) & + (equities_cols.share_class_symbol == share_class_symbol) & (equities_cols.start_date 
<= ad_value) & (equities_cols.end_date >= ad_value), ).execute().fetchall() @@ -290,7 +294,8 @@ class AssetFinder(object): # highest-but-not-over end_date elif not candidates: sid = sa.select((equities_cols.sid,)).where( - (equities_cols.symbol == symbol) & + (equities_cols.company_symbol == company_symbol) & + (equities_cols.share_class_symbol == share_class_symbol) & (equities_cols.start_date <= ad_value), ).order_by( equities_cols.end_date.desc(), @@ -302,7 +307,8 @@ class AssetFinder(object): # end_date as a tie-breaker elif len(candidates) > 1: sid = sa.select((equities_cols.sid,)).where( - (equities_cols.symbol == symbol) & + (equities_cols.company_symbol == company_symbol) & + (equities_cols.share_class_symbol == share_class_symbol) & (equities_cols.start_date <= ad_value), ).order_by( equities_cols.start_date.desc(), @@ -315,7 +321,8 @@ class AssetFinder(object): else: sids = sa.select((equities_cols.sid,)).where( - equities_cols.symbol == symbol, + (equities_cols.company_symbol == company_symbol) & + (equities_cols.share_class_symbol == share_class_symbol) ).execute().fetchall() if len(sids) == 1: return self._retrieve_equity(sids[0]['sid']) @@ -330,7 +337,7 @@ class AssetFinder(object): )) ) - def lookup_symbol(self, symbol, as_of_date, fuzzy=False): + def lookup_symbol(self, symbol, as_of_date): """ If a fuzzy string is provided, then we try various symbols based on the provided symbol. 
This is to facilitate mapping from a broker's @@ -341,41 +348,11 @@ class AssetFinder(object): """ symbol = symbol.upper() - ad_value = pd.Timestamp(normalize_date(as_of_date)).value - if not fuzzy: - try: - return self.lookup_symbol_resolve_multiple(symbol, as_of_date) - except SymbolNotFound: - return None - - fuzzy = symbol.replace(self.fuzzy_char, '') - - equities_cols = self.equities.c - candidates = sa.select((equities_cols.sid,)).where( - (equities_cols.fuzzy == fuzzy) & - (equities_cols.start_date <= ad_value) & - (equities_cols.end_date >= ad_value), - ).execute().fetchall() - - # If one SID exists for symbol, return that symbol - if len(candidates) == 1: - return self._retrieve_equity(candidates[0]['sid']) - - # If multiple SIDs exist for symbol, return latest start_date with - # end_date as a tie-breaker - elif candidates: - sid = sa.select((equities_cols.sid,)).where( - (equities_cols.symbol == symbol) & - (equities_cols.start_date <= ad_value), - ).order_by( - equities_cols.start_date.desc(), - equities_cols.end_date.desc(), - ).scalar() - if sid: - return self._retrieve_equity(sid) - - raise SymbolNotFound(symbol=symbol) + try: + return self.lookup_symbol_resolve_multiple(symbol, as_of_date) + except SymbolNotFound: + return None def lookup_future_chain(self, root_symbol, as_of_date, knowledge_date): """ Return the futures chain for a given root symbol.