ENH: Simplifies AssetFinder symbol lookup by making fuzzy lookup the default

This commit is contained in:
jfkirk
2015-09-15 16:52:06 -04:00
parent 26fd6fda8b
commit 29dce965d5
3 changed files with 81 additions and 58 deletions
+9 -9
View File
@@ -292,12 +292,11 @@ class AssetFinderTestCase(TestCase):
[
{
'sid': i,
'symbol': 'TEST@%d' % i,
'symbol': 'TEST.%d' % i,
'company_name': "company%d" % i,
'start_date': as_of.value,
'end_date': as_of.value,
'exchange': uuid.uuid4().hex,
'fuzzy': 'TEST%d' % i
'exchange': uuid.uuid4().hex
}
for i in range(3)
]
@@ -308,25 +307,26 @@ class AssetFinderTestCase(TestCase):
finder.retrieve_asset(i) for i in range(3)
)
for i in range(2): # we do it twice to test for caching bugs
# we do it twice to catch caching bugs
for i in range(2):
# Shouldn't find this with no fuzzy_str passed.
self.assertIsNone(finder.lookup_symbol('test', as_of))
self.assertIsNone(finder.lookup_symbol('test1', as_of))
self.assertEqual(
asset_1,
finder.lookup_symbol('test@1', as_of)
finder.lookup_symbol('test.1', as_of)
)
# Adding an unnecessary fuzzy shouldn't matter.
self.assertEqual(
asset_1,
finder.lookup_symbol('test@1', as_of, fuzzy=True)
finder.lookup_symbol('test/1', as_of)
)
# Shouldn't find this with no fuzzy_str passed.
self.assertIsNone(finder.lookup_symbol('test1', as_of))
# Should find exact match.
self.assertEqual(
asset_1,
finder.lookup_symbol('test1', as_of, fuzzy=True),
finder.lookup_symbol('test-1', as_of),
)
def test_lookup_symbol_resolve_multiple(self):
+56 -10
View File
@@ -4,7 +4,9 @@ from abc import (
)
from collections import namedtuple
import re
import pandas as pd
import numpy as np
from six import with_metaclass
import sqlalchemy as sa
@@ -34,7 +36,10 @@ FUTURE_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
}
# Expected fields for an Equity's metadata
EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS
EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
'company_symbol',
'share_class_symbol',
}
EXCHANGE_TABLE_FIELDS = frozenset({
'exchange',
@@ -88,6 +93,43 @@ _root_symbols_defaults = {
'exchange': None,
}
# Fuzzy symbol delimiters that may break up a company symbol and share class
_fuzzy_symbol_delimiter_regex = r'[./\-_]'
# Values that are treated as "no symbol". NOTE: membership in this set only
# recognizes the np.nan singleton itself (sets compare by hash, then
# identity/equality, and NaN is never equal to itself), so
# split_fuzzy_symbol performs an additional explicit NaN check.
_fuzzy_symbol_default_triggers = frozenset({np.nan, None, ''})


def split_fuzzy_symbol(fuzzy_symbol):
    """
    Takes in a symbol that may be fuzzy and splits it into a company symbol
    and share class symbol.

    The symbol is split on the first occurrence of any delimiter matched by
    ``_fuzzy_symbol_delimiter_regex`` (``.``, ``/``, ``-`` or ``_``). Any
    further delimiters are left untouched inside the share class symbol.

    Parameters
    ----------
    fuzzy_symbol : str
        The possibly-fuzzy symbol to be split

    Returns
    -------
    ( str, str )
        A tuple of ( company_symbol, share_class_symbol ). The share class
        symbol is '' when the input contains no delimiter. Both elements
        are '' when the input is None, '' or any NaN value.
    """
    # Return blank strings for any bad fuzzy symbols, like NaN or None.
    # The self-inequality test catches every NaN object (float('nan'),
    # numpy float scalars, ...), not only the np.nan singleton that the
    # set-membership test is able to recognize.
    if (fuzzy_symbol in _fuzzy_symbol_default_triggers or
            fuzzy_symbol != fuzzy_symbol):
        return ('', '')

    # maxsplit=1 so that only the first delimiter separates the company
    # symbol from the share class symbol
    split_list = re.split(pattern=_fuzzy_symbol_delimiter_regex,
                          string=fuzzy_symbol,
                          maxsplit=1)

    # Break the list up into its two components, the company symbol and the
    # share class symbol
    company_symbol = split_list[0]
    share_class_symbol = split_list[1] if len(split_list) > 1 else ''

    return (company_symbol, share_class_symbol)
def _generate_output_dataframe(data_subset, defaults):
"""
@@ -163,7 +205,6 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
"""
def write_all(self,
engine,
fuzzy_char=None,
allow_sid_assignment=True,
constraints=True):
""" Write pre-supplied data to SQLite.
@@ -172,8 +213,6 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
----------
engine : Engine
An SQLAlchemy engine to a SQL database.
fuzzy_char : str, optional
A string for use in fuzzy matching.
allow_sid_assignment: bool, optional
If True then the class can assign sids where necessary.
constraints : bool, optional
@@ -192,7 +231,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
self._write_exchanges(data.exchanges, txn)
self._write_root_symbols(data.root_symbols, txn)
self._write_futures(data.futures, txn)
self._write_equities(data.equities, fuzzy_char, txn)
self._write_equities(data.equities, txn)
def _write_exchanges(self, exchanges, bind=None):
recs = exchanges.reset_index().rename_axis(
@@ -222,11 +261,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
self.asset_router.insert().values([(record['sid'], 'future')])\
.execute(bind=bind)
def _write_equities(self, equities, fuzzy_char, bind=None):
# Apply fuzzy matching.
if fuzzy_char:
equities['fuzzy'] = equities['symbol'].str.replace(fuzzy_char, '')
def _write_equities(self, equities, bind=None):
recs = equities.reset_index().rename_axis(
{'index': 'sid'},
1,
@@ -258,6 +293,8 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
primary_key=constraints,
),
sa.Column('symbol', sa.Text),
sa.Column('company_symbol', sa.Text),
sa.Column('share_class_symbol', sa.Text),
sa.Column('asset_name', sa.Text),
sa.Column('start_date', sa.Integer, default=0),
sa.Column('end_date', sa.Integer),
@@ -366,6 +403,15 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
defaults=_equities_defaults,
)
# Split symbols to company_symbols and share_class_symbols
tuple_series = equities_output['symbol'].apply(split_fuzzy_symbol)
split_symbols = pd.DataFrame(
tuple_series.tolist(),
columns=['company_symbol', 'share_class_symbol'],
index=tuple_series.index
)
equities_output = equities_output.join(split_symbols)
# Convert date columns to UNIX Epoch integers (nanoseconds)
equities_output['start_date'] = \
equities_output['start_date'].apply(self.convert_datetime)
+16 -39
View File
@@ -39,6 +39,7 @@ from zipline.assets import (
from zipline.assets.asset_writer import (
FUTURE_TABLE_FIELDS,
EQUITY_TABLE_FIELDS,
split_fuzzy_symbol,
)
log = Logger('assets.py')
@@ -273,13 +274,16 @@ class AssetFinder(object):
if as_of_date is not None:
as_of_date = pd.Timestamp(normalize_date(as_of_date))
company_symbol, share_class_symbol = split_fuzzy_symbol(symbol)
equities_cols = self.equities.c
if as_of_date:
ad_value = as_of_date.value
# If one SID exists for symbol, return that symbol
candidates = sa.select((equities_cols.sid,)).where(
(equities_cols.symbol == symbol) &
(equities_cols.company_symbol == company_symbol) &
(equities_cols.share_class_symbol == share_class_symbol) &
(equities_cols.start_date <= ad_value) &
(equities_cols.end_date >= ad_value),
).execute().fetchall()
@@ -290,7 +294,8 @@ class AssetFinder(object):
# highest-but-not-over end_date
elif not candidates:
sid = sa.select((equities_cols.sid,)).where(
(equities_cols.symbol == symbol) &
(equities_cols.company_symbol == company_symbol) &
(equities_cols.share_class_symbol == share_class_symbol) &
(equities_cols.start_date <= ad_value),
).order_by(
equities_cols.end_date.desc(),
@@ -302,7 +307,8 @@ class AssetFinder(object):
# end_date as a tie-breaker
elif len(candidates) > 1:
sid = sa.select((equities_cols.sid,)).where(
(equities_cols.symbol == symbol) &
(equities_cols.company_symbol == company_symbol) &
(equities_cols.share_class_symbol == share_class_symbol) &
(equities_cols.start_date <= ad_value),
).order_by(
equities_cols.start_date.desc(),
@@ -315,7 +321,8 @@ class AssetFinder(object):
else:
sids = sa.select((equities_cols.sid,)).where(
equities_cols.symbol == symbol,
(equities_cols.company_symbol == company_symbol) &
(equities_cols.share_class_symbol == share_class_symbol)
).execute().fetchall()
if len(sids) == 1:
return self._retrieve_equity(sids[0]['sid'])
@@ -330,7 +337,7 @@ class AssetFinder(object):
))
)
def lookup_symbol(self, symbol, as_of_date, fuzzy=False):
def lookup_symbol(self, symbol, as_of_date):
"""
If a fuzzy string is provided, then we try various symbols based on
the provided symbol. This is to facilitate mapping from a broker's
@@ -341,41 +348,11 @@ class AssetFinder(object):
"""
symbol = symbol.upper()
ad_value = pd.Timestamp(normalize_date(as_of_date)).value
if not fuzzy:
try:
return self.lookup_symbol_resolve_multiple(symbol, as_of_date)
except SymbolNotFound:
return None
fuzzy = symbol.replace(self.fuzzy_char, '')
equities_cols = self.equities.c
candidates = sa.select((equities_cols.sid,)).where(
(equities_cols.fuzzy == fuzzy) &
(equities_cols.start_date <= ad_value) &
(equities_cols.end_date >= ad_value),
).execute().fetchall()
# If one SID exists for symbol, return that symbol
if len(candidates) == 1:
return self._retrieve_equity(candidates[0]['sid'])
# If multiple SIDs exist for symbol, return latest start_date with
# end_date as a tie-breaker
elif candidates:
sid = sa.select((equities_cols.sid,)).where(
(equities_cols.symbol == symbol) &
(equities_cols.start_date <= ad_value),
).order_by(
equities_cols.start_date.desc(),
equities_cols.end_date.desc(),
).scalar()
if sid:
return self._retrieve_equity(sid)
raise SymbolNotFound(symbol=symbol)
try:
return self.lookup_symbol_resolve_multiple(symbol, as_of_date)
except SymbolNotFound:
return None
def lookup_future_chain(self, root_symbol, as_of_date, knowledge_date):
""" Return the futures chain for a given root symbol.