mirror of
https://github.com/wassname/catalyst.git
synced 2026-07-04 16:12:46 +08:00
ENH: Adds fuzzy symbol look-up and makes it distinct from delimited symbols
This commit is contained in:
+44
-7
@@ -286,7 +286,7 @@ class AssetFinderTestCase(TestCase):
|
||||
def setUp(self):
|
||||
self.env = TradingEnvironment()
|
||||
|
||||
def test_lookup_symbol_fuzzy(self):
|
||||
def test_lookup_symbol_delimited(self):
|
||||
as_of = pd.Timestamp('2013-01-01', tz='UTC')
|
||||
frame = pd.DataFrame.from_records(
|
||||
[
|
||||
@@ -302,23 +302,60 @@ class AssetFinderTestCase(TestCase):
|
||||
]
|
||||
)
|
||||
self.env.write_data(equities_df=frame)
|
||||
finder = AssetFinder(self.env.engine, fuzzy_char='@')
|
||||
finder = AssetFinder(self.env.engine)
|
||||
asset_0, asset_1, asset_2 = (
|
||||
finder.retrieve_asset(i) for i in range(3)
|
||||
)
|
||||
|
||||
# we do it twice to catch caching bugs
|
||||
for i in range(2):
|
||||
# Shouldn't find this with no fuzzy_str passed.
|
||||
self.assertIsNone(finder.lookup_symbol('test', as_of))
|
||||
self.assertIsNone(finder.lookup_symbol('test1', as_of))
|
||||
self.assertEqual(asset_1, finder.lookup_symbol('test.1', as_of))
|
||||
# '@' is not a supported delimiter
|
||||
self.assertIsNone(finder.lookup_symbol('test@1', as_of))
|
||||
|
||||
# Adding an unnecessary fuzzy shouldn't matter.
|
||||
self.assertEqual(asset_1, finder.lookup_symbol('test/1', as_of))
|
||||
for fuzzy_char in ['-', '/', '_', '.']:
|
||||
self.assertEqual(
|
||||
asset_1,
|
||||
finder.lookup_symbol('test%s1' % fuzzy_char, as_of)
|
||||
)
|
||||
|
||||
# Should find exact match.
|
||||
self.assertEqual(asset_1, finder.lookup_symbol('test-1', as_of))
|
||||
def test_lookup_symbol_fuzzy(self):
|
||||
metadata = {
|
||||
0: {'symbol': 'PRTY_HRD'},
|
||||
1: {'symbol': 'BRKA'},
|
||||
2: {'symbol': 'BRK_A'},
|
||||
}
|
||||
self.env.write_data(equities_data=metadata)
|
||||
finder = self.env.asset_finder
|
||||
dt = pd.Timestamp('2013-01-01', tz='UTC')
|
||||
|
||||
# Try combos of looking up PRTYHRD with and without a time or fuzzy
|
||||
# Both non-fuzzy look-ups get no result
|
||||
self.assertIsNone(finder.lookup_symbol('PRTYHRD', None))
|
||||
self.assertIsNone(finder.lookup_symbol('PRTYHRD', dt))
|
||||
# Both fuzzy look-ups work
|
||||
self.assertEqual(0, finder.lookup_symbol('PRTYHRD', None, fuzzy=True))
|
||||
self.assertEqual(0, finder.lookup_symbol('PRTYHRD', dt, fuzzy=True))
|
||||
|
||||
# Try combos of looking up PRTY_HRD, all returning sid 0
|
||||
self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', None))
|
||||
self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', dt))
|
||||
self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', None, fuzzy=True))
|
||||
self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', dt, fuzzy=True))
|
||||
|
||||
# Try combos of looking up BRKA, all returning sid 1
|
||||
self.assertEqual(1, finder.lookup_symbol('BRKA', None))
|
||||
self.assertEqual(1, finder.lookup_symbol('BRKA', dt))
|
||||
self.assertEqual(1, finder.lookup_symbol('BRKA', None, fuzzy=True))
|
||||
self.assertEqual(1, finder.lookup_symbol('BRKA', dt, fuzzy=True))
|
||||
|
||||
# Try combos of looking up BRK_A, all returning sid 2
|
||||
self.assertEqual(2, finder.lookup_symbol('BRK_A', None))
|
||||
self.assertEqual(2, finder.lookup_symbol('BRK_A', dt))
|
||||
self.assertEqual(2, finder.lookup_symbol('BRK_A', None, fuzzy=True))
|
||||
self.assertEqual(2, finder.lookup_symbol('BRK_A', dt, fuzzy=True))
|
||||
|
||||
def test_lookup_symbol(self):
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ FUTURE_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
|
||||
EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
|
||||
'company_symbol',
|
||||
'share_class_symbol',
|
||||
'fuzzy_symbol',
|
||||
}
|
||||
|
||||
EXCHANGE_TABLE_FIELDS = frozenset({
|
||||
@@ -94,31 +95,32 @@ _root_symbols_defaults = {
|
||||
}
|
||||
|
||||
# Fuzzy symbol delimiters that may break up a company symbol and share class
|
||||
_fuzzy_symbol_delimiter_regex = r'[./\-_]'
|
||||
_fuzzy_symbol_default_triggers = frozenset({np.nan, None, ''})
|
||||
_delimited_symbol_delimiter_regex = r'[./\-_]'
|
||||
_delimited_symbol_default_triggers = frozenset({np.nan, None, ''})
|
||||
|
||||
|
||||
def split_fuzzy_symbol(fuzzy_symbol):
|
||||
def split_delimited_symbol(symbol):
|
||||
"""
|
||||
Takes in a symbol that may be fuzzy and splits it in to a company symbol
|
||||
and share class symbol.
|
||||
Takes in a symbol that may be delimited and splits it into a company
|
||||
symbol and share class symbol. Also returns the fuzzy symbol, which is the
|
||||
symbol with all delimiter characters removed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fuzzy_symbol : str
|
||||
The possibly-fuzzy symbol to be split
|
||||
symbol : str
|
||||
The possibly-delimited symbol to be split
|
||||
|
||||
Returns
|
||||
-------
|
||||
( str, str )
|
||||
A tuple of ( company_symbol, share_class_symbol )
|
||||
( str, str , str )
|
||||
A tuple of ( company_symbol, share_class_symbol, fuzzy_symbol)
|
||||
"""
|
||||
# return blank strings for any bad symbols, like NaN or None
|
||||
if fuzzy_symbol in _fuzzy_symbol_default_triggers:
|
||||
return ('', '')
|
||||
if symbol in _delimited_symbol_default_triggers:
|
||||
return ('', '', '')
|
||||
|
||||
split_list = re.split(pattern=_fuzzy_symbol_delimiter_regex,
|
||||
string=fuzzy_symbol,
|
||||
split_list = re.split(pattern=_delimited_symbol_delimiter_regex,
|
||||
string=symbol,
|
||||
maxsplit=1)
|
||||
|
||||
# Break the list up into its two components, the company symbol and the
|
||||
@@ -128,7 +130,13 @@ def split_fuzzy_symbol(fuzzy_symbol):
|
||||
share_class_symbol = split_list[1]
|
||||
else:
|
||||
share_class_symbol = ''
|
||||
return (company_symbol, share_class_symbol)
|
||||
|
||||
# Strip all delimiter characters from the symbol to get the fuzzy symbol
|
||||
fuzzy_symbol = re.sub(pattern=_delimited_symbol_delimiter_regex,
|
||||
repl='',
|
||||
string=symbol)
|
||||
|
||||
return (company_symbol, share_class_symbol, fuzzy_symbol)
|
||||
|
||||
|
||||
def _generate_output_dataframe(data_subset, defaults):
|
||||
@@ -194,8 +202,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
|
||||
Methods
|
||||
-------
|
||||
write_all(engine, fuzzy_char=None, allow_sid_assignment=True,
|
||||
constraints=False)
|
||||
write_all(engine, allow_sid_assignment=True, constraints=False)
|
||||
Write the data supplied at initialization to the database.
|
||||
init_db(engine, constraints=False)
|
||||
Create the SQLite tables (called by write_all).
|
||||
@@ -295,6 +302,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
sa.Column('symbol', sa.Text),
|
||||
sa.Column('company_symbol', sa.Text),
|
||||
sa.Column('share_class_symbol', sa.Text),
|
||||
sa.Column('fuzzy_symbol', sa.Text),
|
||||
sa.Column('asset_name', sa.Text),
|
||||
sa.Column('start_date', sa.Integer, default=0),
|
||||
sa.Column('end_date', sa.Integer),
|
||||
@@ -404,10 +412,10 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
)
|
||||
|
||||
# Split symbols to company_symbols and share_class_symbols
|
||||
tuple_series = equities_output['symbol'].apply(split_fuzzy_symbol)
|
||||
tuple_series = equities_output['symbol'].apply(split_delimited_symbol)
|
||||
split_symbols = pd.DataFrame(
|
||||
tuple_series.tolist(),
|
||||
columns=['company_symbol', 'share_class_symbol'],
|
||||
columns=['company_symbol', 'share_class_symbol', 'fuzzy_symbol'],
|
||||
index=tuple_series.index
|
||||
)
|
||||
equities_output = equities_output.join(split_symbols)
|
||||
@@ -419,6 +427,8 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
equities_output.company_symbol.str.upper()
|
||||
equities_output['share_class_symbol'] = \
|
||||
equities_output.share_class_symbol.str.upper()
|
||||
equities_output['fuzzy_symbol'] = \
|
||||
equities_output.fuzzy_symbol.str.upper()
|
||||
|
||||
# Convert date columns to UNIX Epoch integers (nanoseconds)
|
||||
equities_output['start_date'] = \
|
||||
|
||||
@@ -39,7 +39,7 @@ from zipline.assets import (
|
||||
from zipline.assets.asset_writer import (
|
||||
FUTURE_TABLE_FIELDS,
|
||||
EQUITY_TABLE_FIELDS,
|
||||
split_fuzzy_symbol,
|
||||
split_delimited_symbol,
|
||||
)
|
||||
|
||||
log = Logger('assets.py')
|
||||
@@ -88,9 +88,8 @@ class AssetFinder(object):
|
||||
# reference to an AssetFinder
|
||||
PERSISTENT_TOKEN = "<AssetFinder>"
|
||||
|
||||
def __init__(self, engine, allow_sid_assignment=True, fuzzy_char=None):
|
||||
def __init__(self, engine, allow_sid_assignment=True):
|
||||
|
||||
self.fuzzy_char = fuzzy_char
|
||||
self.allow_sid_assignment = allow_sid_assignment
|
||||
|
||||
self.engine = engine
|
||||
@@ -262,14 +261,15 @@ class AssetFinder(object):
|
||||
self._future_cache[sid] = future
|
||||
return future
|
||||
|
||||
def lookup_symbol(self, symbol, as_of_date, default_None=True):
|
||||
def lookup_symbol(self, symbol, as_of_date, default_None=True,
|
||||
fuzzy=False):
|
||||
"""
|
||||
Return matching Asset of name symbol in database.
|
||||
Return matching Equity of name symbol in database.
|
||||
|
||||
If multiple Assets are found and as_of_date is not set,
|
||||
If multiple Equities are found and as_of_date is not set,
|
||||
raises MultipleSymbolsFound.
|
||||
|
||||
If no Asset was active at as_of_date raises SymbolNotFound, or None
|
||||
If no Equity was active at as_of_date raises SymbolNotFound, or None
|
||||
if default_None is true.
|
||||
"""
|
||||
|
||||
@@ -278,19 +278,35 @@ class AssetFinder(object):
|
||||
if as_of_date is not None:
|
||||
as_of_date = pd.Timestamp(normalize_date(as_of_date))
|
||||
|
||||
company_symbol, share_class_symbol = split_fuzzy_symbol(symbol)
|
||||
company_symbol, share_class_symbol, fuzzy_symbol = \
|
||||
split_delimited_symbol(symbol)
|
||||
|
||||
equities_cols = self.equities.c
|
||||
if as_of_date:
|
||||
ad_value = as_of_date.value
|
||||
|
||||
# If one SID exists for symbol, return that symbol
|
||||
if fuzzy:
|
||||
# Search for a single exact match on the fuzzy column
|
||||
fuzzy_candidates = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.fuzzy_symbol == fuzzy_symbol) &
|
||||
(equities_cols.start_date <= ad_value) &
|
||||
(equities_cols.end_date >= ad_value),
|
||||
).execute().fetchall()
|
||||
|
||||
# If exactly one SID exists for fuzzy_symbol, return that sid
|
||||
if len(fuzzy_candidates) == 1:
|
||||
return self._retrieve_equity(fuzzy_candidates[0]['sid'])
|
||||
|
||||
# Search for exact matches of the split-up company_symbol and
|
||||
# share_class_symbol
|
||||
candidates = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.company_symbol == company_symbol) &
|
||||
(equities_cols.share_class_symbol == share_class_symbol) &
|
||||
(equities_cols.start_date <= ad_value) &
|
||||
(equities_cols.end_date >= ad_value),
|
||||
).execute().fetchall()
|
||||
|
||||
# If exactly one SID exists for symbol, return that equity
|
||||
if len(candidates) == 1:
|
||||
return self._retrieve_equity(candidates[0]['sid'])
|
||||
|
||||
@@ -327,6 +343,15 @@ class AssetFinder(object):
|
||||
raise SymbolNotFound(symbol=symbol)
|
||||
|
||||
else:
|
||||
# If this is a fuzzy look-up, check if there is exactly one match
|
||||
# for the fuzzy symbol
|
||||
if fuzzy:
|
||||
fuzzy_sids = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.fuzzy_symbol == fuzzy_symbol)
|
||||
).execute().fetchall()
|
||||
if len(fuzzy_sids) == 1:
|
||||
return self._retrieve_equity(fuzzy_sids[0]['sid'])
|
||||
|
||||
sids = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.company_symbol == company_symbol) &
|
||||
(equities_cols.share_class_symbol == share_class_symbol)
|
||||
|
||||
Reference in New Issue
Block a user