From c446e7f62accb81ca3763fbf9733798b04880a6a Mon Sep 17 00:00:00 2001 From: jfkirk Date: Wed, 16 Sep 2015 14:34:18 -0400 Subject: [PATCH] ENH: Adds fuzzy symbol look-up and makes it distinct from delimited symbols --- tests/test_assets.py | 51 +++++++++++++++++++++++++++++----- zipline/assets/asset_writer.py | 46 ++++++++++++++++++------------ zipline/assets/assets.py | 43 ++++++++++++++++++++++------ 3 files changed, 106 insertions(+), 34 deletions(-) diff --git a/tests/test_assets.py b/tests/test_assets.py index 6127ca43..3915c4ee 100644 --- a/tests/test_assets.py +++ b/tests/test_assets.py @@ -286,7 +286,7 @@ class AssetFinderTestCase(TestCase): def setUp(self): self.env = TradingEnvironment() - def test_lookup_symbol_fuzzy(self): + def test_lookup_symbol_delimited(self): as_of = pd.Timestamp('2013-01-01', tz='UTC') frame = pd.DataFrame.from_records( [ @@ -302,23 +302,60 @@ class AssetFinderTestCase(TestCase): ] ) self.env.write_data(equities_df=frame) - finder = AssetFinder(self.env.engine, fuzzy_char='@') + finder = AssetFinder(self.env.engine) asset_0, asset_1, asset_2 = ( finder.retrieve_asset(i) for i in range(3) ) # we do it twice to catch caching bugs for i in range(2): - # Shouldn't find this with no fuzzy_str passed. self.assertIsNone(finder.lookup_symbol('test', as_of)) self.assertIsNone(finder.lookup_symbol('test1', as_of)) - self.assertEqual(asset_1, finder.lookup_symbol('test.1', as_of)) + # '@' is not a supported delimiter + self.assertIsNone(finder.lookup_symbol('test@1', as_of)) # Adding an unnecessary fuzzy shouldn't matter. - self.assertEqual(asset_1, finder.lookup_symbol('test/1', as_of)) + for fuzzy_char in ['-', '/', '_', '.']: + self.assertEqual( + asset_1, + finder.lookup_symbol('test%s1' % fuzzy_char, as_of) + ) - # Should find exact match. 
- self.assertEqual(asset_1, finder.lookup_symbol('test-1', as_of)) + def test_lookup_symbol_fuzzy(self): + metadata = { + 0: {'symbol': 'PRTY_HRD'}, + 1: {'symbol': 'BRKA'}, + 2: {'symbol': 'BRK_A'}, + } + self.env.write_data(equities_data=metadata) + finder = self.env.asset_finder + dt = pd.Timestamp('2013-01-01', tz='UTC') + + # Try combos of looking up PRTYHRD with and without a time or fuzzy + # Both non-fuzzys get no result + self.assertIsNone(finder.lookup_symbol('PRTYHRD', None)) + self.assertIsNone(finder.lookup_symbol('PRTYHRD', dt)) + # Both fuzzys work + self.assertEqual(0, finder.lookup_symbol('PRTYHRD', None, fuzzy=True)) + self.assertEqual(0, finder.lookup_symbol('PRTYHRD', dt, fuzzy=True)) + + # Try combos of looking up PRTY_HRD, all returning sid 0 + self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', None)) + self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', dt)) + self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', None, fuzzy=True)) + self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', dt, fuzzy=True)) + + # Try combos of looking up BRKA, all returning sid 1 + self.assertEqual(1, finder.lookup_symbol('BRKA', None)) + self.assertEqual(1, finder.lookup_symbol('BRKA', dt)) + self.assertEqual(1, finder.lookup_symbol('BRKA', None, fuzzy=True)) + self.assertEqual(1, finder.lookup_symbol('BRKA', dt, fuzzy=True)) + + # Try combos of looking up BRK_A, all returning sid 2 + self.assertEqual(2, finder.lookup_symbol('BRK_A', None)) + self.assertEqual(2, finder.lookup_symbol('BRK_A', dt)) + self.assertEqual(2, finder.lookup_symbol('BRK_A', None, fuzzy=True)) + self.assertEqual(2, finder.lookup_symbol('BRK_A', dt, fuzzy=True)) def test_lookup_symbol(self): diff --git a/zipline/assets/asset_writer.py b/zipline/assets/asset_writer.py index 71c6a35f..309bd110 100644 --- a/zipline/assets/asset_writer.py +++ b/zipline/assets/asset_writer.py @@ -39,6 +39,7 @@ FUTURE_TABLE_FIELDS = ASSET_TABLE_FIELDS | { EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS | { 
'company_symbol', 'share_class_symbol', + 'fuzzy_symbol', } EXCHANGE_TABLE_FIELDS = frozenset({ @@ -94,31 +95,32 @@ _root_symbols_defaults = { } # Fuzzy symbol delimiters that may break up a company symbol and share class -_fuzzy_symbol_delimiter_regex = r'[./\-_]' -_fuzzy_symbol_default_triggers = frozenset({np.nan, None, ''}) +_delimited_symbol_delimiter_regex = r'[./\-_]' +_delimited_symbol_default_triggers = frozenset({np.nan, None, ''}) -def split_fuzzy_symbol(fuzzy_symbol): +def split_delimited_symbol(symbol): """ - Takes in a symbol that may be fuzzy and splits it in to a company symbol - and share class symbol. + Takes in a symbol that may be delimited and splits it into a company + symbol and share class symbol. Also returns the fuzzy symbol, which is the + symbol with every delimiter character removed. Parameters ---------- - fuzzy_symbol : str - The possibly-fuzzy symbol to be split + symbol : str + The possibly-delimited symbol to be split Returns ------- - ( str, str ) - A tuple of ( company_symbol, share_class_symbol ) + ( str, str, str ) + A tuple of ( company_symbol, share_class_symbol, fuzzy_symbol ) """ # return blank strings for any bad fuzzy symbols, like NaN or None - if fuzzy_symbol in _fuzzy_symbol_default_triggers: - return ('', '') + if symbol in _delimited_symbol_default_triggers: + return ('', '', '') - split_list = re.split(pattern=_fuzzy_symbol_delimiter_regex, - string=fuzzy_symbol, + split_list = re.split(pattern=_delimited_symbol_delimiter_regex, + string=symbol, maxsplit=1) # Break the list up in to its two components, the company symbol and the @@ -128,7 +130,13 @@ def split_fuzzy_symbol(fuzzy_symbol): share_class_symbol = split_list[1] else: share_class_symbol = '' - return (company_symbol, share_class_symbol) + + # Strip all delimiter characters from the symbol to get the fuzzy symbol + fuzzy_symbol = re.sub(pattern=_delimited_symbol_delimiter_regex, + repl='', + string=symbol) + + return (company_symbol, share_class_symbol, 
fuzzy_symbol) def _generate_output_dataframe(data_subset, defaults): @@ -194,8 +202,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)): Methods ------- - write_all(engine, fuzzy_char=None, allow_sid_assignment=True, - constraints=False) + write_all(engine, allow_sid_assignment=True, constraints=False) Write the data supplied at initialization to the database. init_db(engine, constraints=False) Create the SQLite tables (called by write_all). @@ -295,6 +302,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)): sa.Column('symbol', sa.Text), sa.Column('company_symbol', sa.Text), sa.Column('share_class_symbol', sa.Text), + sa.Column('fuzzy_symbol', sa.Text), sa.Column('asset_name', sa.Text), sa.Column('start_date', sa.Integer, default=0), sa.Column('end_date', sa.Integer), @@ -404,10 +412,10 @@ class AssetDBWriter(with_metaclass(ABCMeta)): ) # Split symbols to company_symbols and share_class_symbols - tuple_series = equities_output['symbol'].apply(split_fuzzy_symbol) + tuple_series = equities_output['symbol'].apply(split_delimited_symbol) split_symbols = pd.DataFrame( tuple_series.tolist(), - columns=['company_symbol', 'share_class_symbol'], + columns=['company_symbol', 'share_class_symbol', 'fuzzy_symbol'], index=tuple_series.index ) equities_output = equities_output.join(split_symbols) @@ -419,6 +427,8 @@ class AssetDBWriter(with_metaclass(ABCMeta)): equities_output.company_symbol.str.upper() equities_output['share_class_symbol'] = \ equities_output.share_class_symbol.str.upper() + equities_output['fuzzy_symbol'] = \ + equities_output.fuzzy_symbol.str.upper() # Convert date columns to UNIX Epoch integers (nanoseconds) equities_output['start_date'] = \ diff --git a/zipline/assets/assets.py b/zipline/assets/assets.py index b44d56b1..097d12bd 100644 --- a/zipline/assets/assets.py +++ b/zipline/assets/assets.py @@ -39,7 +39,7 @@ from zipline.assets import ( from zipline.assets.asset_writer import ( FUTURE_TABLE_FIELDS, EQUITY_TABLE_FIELDS, - split_fuzzy_symbol, + 
split_delimited_symbol, ) log = Logger('assets.py') @@ -88,9 +88,8 @@ class AssetFinder(object): # reference to an AssetFinder PERSISTENT_TOKEN = "" - def __init__(self, engine, allow_sid_assignment=True, fuzzy_char=None): + def __init__(self, engine, allow_sid_assignment=True): - self.fuzzy_char = fuzzy_char self.allow_sid_assignment = allow_sid_assignment self.engine = engine @@ -262,14 +261,15 @@ class AssetFinder(object): self._future_cache[sid] = future return future - def lookup_symbol(self, symbol, as_of_date, default_None=True): + def lookup_symbol(self, symbol, as_of_date, default_None=True, + fuzzy=False): """ - Return matching Asset of name symbol in database. + Return the Equity matching the given symbol in the database. - If multiple Assets are found and as_of_date is not set, + If multiple Equities are found and as_of_date is not set, raises MultipleSymbolsFound. - If no Asset was active at as_of_date raises SymbolNotFound, or None + If no Equity was active at as_of_date, raises SymbolNotFound, or returns None if default_None is true. 
""" @@ -278,19 +278,35 @@ class AssetFinder(object): if as_of_date is not None: as_of_date = pd.Timestamp(normalize_date(as_of_date)) - company_symbol, share_class_symbol = split_fuzzy_symbol(symbol) + company_symbol, share_class_symbol, fuzzy_symbol = \ + split_delimited_symbol(symbol) equities_cols = self.equities.c if as_of_date: ad_value = as_of_date.value - # If one SID exists for symbol, return that symbol + if fuzzy: + # Search for a single exact match on the fuzzy column + fuzzy_candidates = sa.select((equities_cols.sid,)).where( + (equities_cols.fuzzy_symbol == fuzzy_symbol) & + (equities_cols.start_date <= ad_value) & + (equities_cols.end_date >= ad_value), + ).execute().fetchall() + + # If exactly one SID exists for fuzzy_symbol, return that sid + if len(fuzzy_candidates) == 1: + return self._retrieve_equity(fuzzy_candidates[0]['sid']) + + # Search for exact matches of the split-up company_symbol and + # share_class_symbol candidates = sa.select((equities_cols.sid,)).where( (equities_cols.company_symbol == company_symbol) & (equities_cols.share_class_symbol == share_class_symbol) & (equities_cols.start_date <= ad_value) & (equities_cols.end_date >= ad_value), ).execute().fetchall() + + # If exactly one SID exists for symbol, return that symbol if len(candidates) == 1: return self._retrieve_equity(candidates[0]['sid']) @@ -327,6 +343,15 @@ class AssetFinder(object): raise SymbolNotFound(symbol=symbol) else: + # If this is a fuzzy look-up, check if there is exactly one match + # for the fuzzy symbol + if fuzzy: + fuzzy_sids = sa.select((equities_cols.sid,)).where( + (equities_cols.fuzzy_symbol == fuzzy_symbol) + ).execute().fetchall() + if len(fuzzy_sids) == 1: + return self._retrieve_equity(fuzzy_sids[0]['sid']) + sids = sa.select((equities_cols.sid,)).where( (equities_cols.company_symbol == company_symbol) & (equities_cols.share_class_symbol == share_class_symbol)