ENH: Adds fuzzy symbol look-up and makes it distinct from delimited symbols

This commit is contained in:
jfkirk
2015-09-16 14:34:18 -04:00
parent d84bdefef8
commit c446e7f62a
3 changed files with 106 additions and 34 deletions
+44 -7
View File
@@ -286,7 +286,7 @@ class AssetFinderTestCase(TestCase):
def setUp(self):
self.env = TradingEnvironment()
def test_lookup_symbol_fuzzy(self):
def test_lookup_symbol_delimited(self):
as_of = pd.Timestamp('2013-01-01', tz='UTC')
frame = pd.DataFrame.from_records(
[
@@ -302,23 +302,60 @@ class AssetFinderTestCase(TestCase):
]
)
self.env.write_data(equities_df=frame)
finder = AssetFinder(self.env.engine, fuzzy_char='@')
finder = AssetFinder(self.env.engine)
asset_0, asset_1, asset_2 = (
finder.retrieve_asset(i) for i in range(3)
)
# we do it twice to catch caching bugs
for i in range(2):
# Shouldn't find this with no fuzzy_str passed.
self.assertIsNone(finder.lookup_symbol('test', as_of))
self.assertIsNone(finder.lookup_symbol('test1', as_of))
self.assertEqual(asset_1, finder.lookup_symbol('test.1', as_of))
# '@' is not a supported delimiter
self.assertIsNone(finder.lookup_symbol('test@1', as_of))
# Adding an unnecessary fuzzy shouldn't matter.
self.assertEqual(asset_1, finder.lookup_symbol('test/1', as_of))
for fuzzy_char in ['-', '/', '_', '.']:
self.assertEqual(
asset_1,
finder.lookup_symbol('test%s1' % fuzzy_char, as_of)
)
# Should find exact match.
self.assertEqual(asset_1, finder.lookup_symbol('test-1', as_of))
def test_lookup_symbol_fuzzy(self):
    # Exercises lookup_symbol with every combination of (no date / a date)
    # and (exact / fuzzy) against three equities whose symbols differ only
    # by a delimiter.
    self.env.write_data(equities_data={
        0: {'symbol': 'PRTY_HRD'},
        1: {'symbol': 'BRKA'},
        2: {'symbol': 'BRK_A'},
    })
    lookup = self.env.asset_finder.lookup_symbol
    as_of_dates = (None, pd.Timestamp('2013-01-01', tz='UTC'))

    # 'PRTYHRD' is stored only as 'PRTY_HRD', so the exact look-ups find
    # nothing, with or without a date...
    for as_of in as_of_dates:
        self.assertIsNone(lookup('PRTYHRD', as_of))
    # ...while the fuzzy look-ups resolve it to sid 0.
    for as_of in as_of_dates:
        self.assertEqual(0, lookup('PRTYHRD', as_of, fuzzy=True))

    # Symbols spelled exactly as stored resolve to their own sid in every
    # combination; in particular 'BRKA' (sid 1) and 'BRK_A' (sid 2) stay
    # distinct even when fuzzy=True.
    for expected_sid, stored_symbol in enumerate(
            ('PRTY_HRD', 'BRKA', 'BRK_A')):
        for as_of in as_of_dates:
            self.assertEqual(expected_sid, lookup(stored_symbol, as_of))
        for as_of in as_of_dates:
            self.assertEqual(
                expected_sid,
                lookup(stored_symbol, as_of, fuzzy=True),
            )
def test_lookup_symbol(self):
+28 -18
View File
@@ -39,6 +39,7 @@ FUTURE_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
# Equity-specific fields added on top of the shared asset fields: the two
# halves of a delimited symbol (company and share class) and 'fuzzy_symbol',
# the symbol with every delimiter character stripped out.
EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
'company_symbol',
'share_class_symbol',
'fuzzy_symbol',
}
EXCHANGE_TABLE_FIELDS = frozenset({
@@ -94,31 +95,32 @@ _root_symbols_defaults = {
}
# Delimiters that may separate a company symbol from its share class
_fuzzy_symbol_delimiter_regex = r'[./\-_]'
_fuzzy_symbol_default_triggers = frozenset({np.nan, None, ''})
_delimited_symbol_delimiter_regex = r'[./\-_]'
_delimited_symbol_default_triggers = frozenset({np.nan, None, ''})
def split_fuzzy_symbol(fuzzy_symbol):
def split_delimited_symbol(symbol):
"""
Takes in a symbol that may be fuzzy and splits it in to a company symbol
and share class symbol.
Takes in a symbol that may be delimited and splits it into a company
symbol and share class symbol. Also returns the fuzzy symbol, which is the
symbol with all delimiter characters removed.
Parameters
----------
fuzzy_symbol : str
The possibly-fuzzy symbol to be split
symbol : str
The possibly-delimited symbol to be split
Returns
-------
( str, str )
A tuple of ( company_symbol, share_class_symbol )
( str, str, str )
A tuple of ( company_symbol, share_class_symbol, fuzzy_symbol )
"""
# Return blank strings for any bad symbols, like NaN or None
if fuzzy_symbol in _fuzzy_symbol_default_triggers:
return ('', '')
if symbol in _delimited_symbol_default_triggers:
return ('', '', '')
split_list = re.split(pattern=_fuzzy_symbol_delimiter_regex,
string=fuzzy_symbol,
split_list = re.split(pattern=_delimited_symbol_delimiter_regex,
string=symbol,
maxsplit=1)
# Break the list up into its two components, the company symbol and the
@@ -128,7 +130,13 @@ def split_fuzzy_symbol(fuzzy_symbol):
share_class_symbol = split_list[1]
else:
share_class_symbol = ''
return (company_symbol, share_class_symbol)
# Strip all delimiter characters from the symbol to get the fuzzy symbol
fuzzy_symbol = re.sub(pattern=_delimited_symbol_delimiter_regex,
repl='',
string=symbol)
return (company_symbol, share_class_symbol, fuzzy_symbol)
def _generate_output_dataframe(data_subset, defaults):
@@ -194,8 +202,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
Methods
-------
write_all(engine, fuzzy_char=None, allow_sid_assignment=True,
constraints=False)
write_all(engine, allow_sid_assignment=True, constraints=False)
Write the data supplied at initialization to the database.
init_db(engine, constraints=False)
Create the SQLite tables (called by write_all).
@@ -295,6 +302,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
sa.Column('symbol', sa.Text),
sa.Column('company_symbol', sa.Text),
sa.Column('share_class_symbol', sa.Text),
sa.Column('fuzzy_symbol', sa.Text),
sa.Column('asset_name', sa.Text),
sa.Column('start_date', sa.Integer, default=0),
sa.Column('end_date', sa.Integer),
@@ -404,10 +412,10 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
)
# Split symbols to company_symbols and share_class_symbols
tuple_series = equities_output['symbol'].apply(split_fuzzy_symbol)
tuple_series = equities_output['symbol'].apply(split_delimited_symbol)
split_symbols = pd.DataFrame(
tuple_series.tolist(),
columns=['company_symbol', 'share_class_symbol'],
columns=['company_symbol', 'share_class_symbol', 'fuzzy_symbol'],
index=tuple_series.index
)
equities_output = equities_output.join(split_symbols)
@@ -419,6 +427,8 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
equities_output.company_symbol.str.upper()
equities_output['share_class_symbol'] = \
equities_output.share_class_symbol.str.upper()
equities_output['fuzzy_symbol'] = \
equities_output.fuzzy_symbol.str.upper()
# Convert date columns to UNIX Epoch integers (nanoseconds)
equities_output['start_date'] = \
+34 -9
View File
@@ -39,7 +39,7 @@ from zipline.assets import (
from zipline.assets.asset_writer import (
FUTURE_TABLE_FIELDS,
EQUITY_TABLE_FIELDS,
split_fuzzy_symbol,
split_delimited_symbol,
)
log = Logger('assets.py')
@@ -88,9 +88,8 @@ class AssetFinder(object):
# reference to an AssetFinder
PERSISTENT_TOKEN = "<AssetFinder>"
def __init__(self, engine, allow_sid_assignment=True, fuzzy_char=None):
def __init__(self, engine, allow_sid_assignment=True):
self.fuzzy_char = fuzzy_char
self.allow_sid_assignment = allow_sid_assignment
self.engine = engine
@@ -262,14 +261,15 @@ class AssetFinder(object):
self._future_cache[sid] = future
return future
def lookup_symbol(self, symbol, as_of_date, default_None=True):
def lookup_symbol(self, symbol, as_of_date, default_None=True,
fuzzy=False):
"""
Return matching Asset of name symbol in database.
Return matching Equity of name symbol in database.
If multiple Assets are found and as_of_date is not set,
If multiple Equities are found and as_of_date is not set,
raises MultipleSymbolsFound.
If no Asset was active at as_of_date raises SymbolNotFound, or None
If no Equity was active at as_of_date, raises SymbolNotFound, or returns
None if default_None is true.
"""
@@ -278,19 +278,35 @@ class AssetFinder(object):
if as_of_date is not None:
as_of_date = pd.Timestamp(normalize_date(as_of_date))
company_symbol, share_class_symbol = split_fuzzy_symbol(symbol)
company_symbol, share_class_symbol, fuzzy_symbol = \
split_delimited_symbol(symbol)
equities_cols = self.equities.c
if as_of_date:
ad_value = as_of_date.value
# If one SID exists for symbol, return that symbol
if fuzzy:
# Search for a single exact match on the fuzzy column
fuzzy_candidates = sa.select((equities_cols.sid,)).where(
(equities_cols.fuzzy_symbol == fuzzy_symbol) &
(equities_cols.start_date <= ad_value) &
(equities_cols.end_date >= ad_value),
).execute().fetchall()
# If exactly one SID exists for fuzzy_symbol, return that equity
if len(fuzzy_candidates) == 1:
return self._retrieve_equity(fuzzy_candidates[0]['sid'])
# Search for exact matches of the split-up company_symbol and
# share_class_symbol
candidates = sa.select((equities_cols.sid,)).where(
(equities_cols.company_symbol == company_symbol) &
(equities_cols.share_class_symbol == share_class_symbol) &
(equities_cols.start_date <= ad_value) &
(equities_cols.end_date >= ad_value),
).execute().fetchall()
# If exactly one SID exists for symbol, return that equity
if len(candidates) == 1:
return self._retrieve_equity(candidates[0]['sid'])
@@ -327,6 +343,15 @@ class AssetFinder(object):
raise SymbolNotFound(symbol=symbol)
else:
# If this is a fuzzy look-up, check if there is exactly one match
# for the fuzzy symbol
if fuzzy:
fuzzy_sids = sa.select((equities_cols.sid,)).where(
(equities_cols.fuzzy_symbol == fuzzy_symbol)
).execute().fetchall()
if len(fuzzy_sids) == 1:
return self._retrieve_equity(fuzzy_sids[0]['sid'])
sids = sa.select((equities_cols.sid,)).where(
(equities_cols.company_symbol == company_symbol) &
(equities_cols.share_class_symbol == share_class_symbol)