mirror of
https://github.com/wassname/catalyst.git
synced 2026-07-05 19:39:15 +08:00
ENH: Simplifies AssetFinder symbol lookup by making fuzzy lookup the default
This commit is contained in:
@@ -292,12 +292,11 @@ class AssetFinderTestCase(TestCase):
|
||||
[
|
||||
{
|
||||
'sid': i,
|
||||
'symbol': 'TEST@%d' % i,
|
||||
'symbol': 'TEST.%d' % i,
|
||||
'company_name': "company%d" % i,
|
||||
'start_date': as_of.value,
|
||||
'end_date': as_of.value,
|
||||
'exchange': uuid.uuid4().hex,
|
||||
'fuzzy': 'TEST%d' % i
|
||||
'exchange': uuid.uuid4().hex
|
||||
}
|
||||
for i in range(3)
|
||||
]
|
||||
@@ -308,25 +307,26 @@ class AssetFinderTestCase(TestCase):
|
||||
finder.retrieve_asset(i) for i in range(3)
|
||||
)
|
||||
|
||||
for i in range(2): # we do it twice to test for caching bugs
|
||||
# we do it twice to catch caching bugs
|
||||
for i in range(2):
|
||||
# Shouldn't find this with no fuzzy_str passed.
|
||||
self.assertIsNone(finder.lookup_symbol('test', as_of))
|
||||
self.assertIsNone(finder.lookup_symbol('test1', as_of))
|
||||
self.assertEqual(
|
||||
asset_1,
|
||||
finder.lookup_symbol('test@1', as_of)
|
||||
finder.lookup_symbol('test.1', as_of)
|
||||
)
|
||||
|
||||
# Adding an unnecessary fuzzy shouldn't matter.
|
||||
self.assertEqual(
|
||||
asset_1,
|
||||
finder.lookup_symbol('test@1', as_of, fuzzy=True)
|
||||
finder.lookup_symbol('test/1', as_of)
|
||||
)
|
||||
|
||||
# Shouldn't find this with no fuzzy_str passed.
|
||||
self.assertIsNone(finder.lookup_symbol('test1', as_of))
|
||||
# Should find exact match.
|
||||
self.assertEqual(
|
||||
asset_1,
|
||||
finder.lookup_symbol('test1', as_of, fuzzy=True),
|
||||
finder.lookup_symbol('test-1', as_of),
|
||||
)
|
||||
|
||||
def test_lookup_symbol_resolve_multiple(self):
|
||||
|
||||
@@ -4,7 +4,9 @@ from abc import (
|
||||
)
|
||||
from collections import namedtuple
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from six import with_metaclass
|
||||
import sqlalchemy as sa
|
||||
|
||||
@@ -34,7 +36,10 @@ FUTURE_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
|
||||
}
|
||||
|
||||
# Expected fields for an Equity's metadata
|
||||
EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS
|
||||
EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
|
||||
'company_symbol',
|
||||
'share_class_symbol',
|
||||
}
|
||||
|
||||
EXCHANGE_TABLE_FIELDS = frozenset({
|
||||
'exchange',
|
||||
@@ -88,6 +93,43 @@ _root_symbols_defaults = {
|
||||
'exchange': None,
|
||||
}
|
||||
|
||||
# Fuzzy symbol delimiters that may break up a company symbol and share class.
# Kept as a raw pattern string; `re` caches the compiled form internally.
_fuzzy_symbol_delimiter_regex = r'[./\-_]'

# Values that are treated as "no symbol" and map to a pair of empty strings.
# NOTE: NaN only matches in this set by object identity (NaN != NaN), so
# split_fuzzy_symbol performs an additional explicit NaN check.
_fuzzy_symbol_default_triggers = frozenset({np.nan, None, ''})


def split_fuzzy_symbol(fuzzy_symbol):
    """
    Takes in a symbol that may be fuzzy and splits it into a company symbol
    and share class symbol.

    The split happens at the first delimiter matched by
    ``_fuzzy_symbol_delimiter_regex`` (one of ``.``, ``/``, ``-``, ``_``);
    everything after that first delimiter, including any further
    delimiters, is returned as the share class symbol.

    Parameters
    ----------
    fuzzy_symbol : str
        The possibly-fuzzy symbol to be split. ``None``, the empty string
        and any NaN value are accepted and treated as a missing symbol.

    Returns
    -------
    ( str, str )
        A tuple of ( company_symbol, share_class_symbol ). Both are empty
        strings for a missing symbol; share_class_symbol is an empty string
        when the symbol contains no delimiter.
    """
    # Return blank strings for any bad fuzzy symbols, like NaN or None.
    # Set membership only recognises the np.nan singleton itself, so also
    # catch every other NaN object (float('nan'), NaNs produced by
    # arithmetic or casts) via the self-inequality property of NaN. Without
    # this, such values would reach re.split and raise a TypeError.
    if (fuzzy_symbol in _fuzzy_symbol_default_triggers or
            fuzzy_symbol != fuzzy_symbol):
        return ('', '')

    split_list = re.split(pattern=_fuzzy_symbol_delimiter_regex,
                          string=fuzzy_symbol,
                          maxsplit=1)

    # Break the list up into its two components, the company symbol and the
    # share class symbol. With maxsplit=1 the list has one or two entries.
    company_symbol = split_list[0]
    if len(split_list) > 1:
        share_class_symbol = split_list[1]
    else:
        share_class_symbol = ''
    return (company_symbol, share_class_symbol)
|
||||
|
||||
|
||||
def _generate_output_dataframe(data_subset, defaults):
|
||||
"""
|
||||
@@ -163,7 +205,6 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
"""
|
||||
def write_all(self,
|
||||
engine,
|
||||
fuzzy_char=None,
|
||||
allow_sid_assignment=True,
|
||||
constraints=True):
|
||||
""" Write pre-supplied data to SQLite.
|
||||
@@ -172,8 +213,6 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
----------
|
||||
engine : Engine
|
||||
An SQLAlchemy engine to a SQL database.
|
||||
fuzzy_char : str, optional
|
||||
A string for use in fuzzy matching.
|
||||
allow_sid_assignment: bool, optional
|
||||
If True then the class can assign sids where necessary.
|
||||
constraints : bool, optional
|
||||
@@ -192,7 +231,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
self._write_exchanges(data.exchanges, txn)
|
||||
self._write_root_symbols(data.root_symbols, txn)
|
||||
self._write_futures(data.futures, txn)
|
||||
self._write_equities(data.equities, fuzzy_char, txn)
|
||||
self._write_equities(data.equities, txn)
|
||||
|
||||
def _write_exchanges(self, exchanges, bind=None):
|
||||
recs = exchanges.reset_index().rename_axis(
|
||||
@@ -222,11 +261,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
self.asset_router.insert().values([(record['sid'], 'future')])\
|
||||
.execute(bind=bind)
|
||||
|
||||
def _write_equities(self, equities, fuzzy_char, bind=None):
|
||||
# Apply fuzzy matching.
|
||||
if fuzzy_char:
|
||||
equities['fuzzy'] = equities['symbol'].str.replace(fuzzy_char, '')
|
||||
|
||||
def _write_equities(self, equities, bind=None):
|
||||
recs = equities.reset_index().rename_axis(
|
||||
{'index': 'sid'},
|
||||
1,
|
||||
@@ -258,6 +293,8 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
primary_key=constraints,
|
||||
),
|
||||
sa.Column('symbol', sa.Text),
|
||||
sa.Column('company_symbol', sa.Text),
|
||||
sa.Column('share_class_symbol', sa.Text),
|
||||
sa.Column('asset_name', sa.Text),
|
||||
sa.Column('start_date', sa.Integer, default=0),
|
||||
sa.Column('end_date', sa.Integer),
|
||||
@@ -366,6 +403,15 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
|
||||
defaults=_equities_defaults,
|
||||
)
|
||||
|
||||
# Split symbols to company_symbols and share_class_symbols
|
||||
tuple_series = equities_output['symbol'].apply(split_fuzzy_symbol)
|
||||
split_symbols = pd.DataFrame(
|
||||
tuple_series.tolist(),
|
||||
columns=['company_symbol', 'share_class_symbol'],
|
||||
index=tuple_series.index
|
||||
)
|
||||
equities_output = equities_output.join(split_symbols)
|
||||
|
||||
# Convert date columns to UNIX Epoch integers (nanoseconds)
|
||||
equities_output['start_date'] = \
|
||||
equities_output['start_date'].apply(self.convert_datetime)
|
||||
|
||||
+16
-39
@@ -39,6 +39,7 @@ from zipline.assets import (
|
||||
from zipline.assets.asset_writer import (
|
||||
FUTURE_TABLE_FIELDS,
|
||||
EQUITY_TABLE_FIELDS,
|
||||
split_fuzzy_symbol,
|
||||
)
|
||||
|
||||
log = Logger('assets.py')
|
||||
@@ -273,13 +274,16 @@ class AssetFinder(object):
|
||||
if as_of_date is not None:
|
||||
as_of_date = pd.Timestamp(normalize_date(as_of_date))
|
||||
|
||||
company_symbol, share_class_symbol = split_fuzzy_symbol(symbol)
|
||||
|
||||
equities_cols = self.equities.c
|
||||
if as_of_date:
|
||||
ad_value = as_of_date.value
|
||||
|
||||
# If one SID exists for symbol, return that symbol
|
||||
candidates = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.symbol == symbol) &
|
||||
(equities_cols.company_symbol == company_symbol) &
|
||||
(equities_cols.share_class_symbol == share_class_symbol) &
|
||||
(equities_cols.start_date <= ad_value) &
|
||||
(equities_cols.end_date >= ad_value),
|
||||
).execute().fetchall()
|
||||
@@ -290,7 +294,8 @@ class AssetFinder(object):
|
||||
# highest-but-not-over end_date
|
||||
elif not candidates:
|
||||
sid = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.symbol == symbol) &
|
||||
(equities_cols.company_symbol == company_symbol) &
|
||||
(equities_cols.share_class_symbol == share_class_symbol) &
|
||||
(equities_cols.start_date <= ad_value),
|
||||
).order_by(
|
||||
equities_cols.end_date.desc(),
|
||||
@@ -302,7 +307,8 @@ class AssetFinder(object):
|
||||
# end_date as a tie-breaker
|
||||
elif len(candidates) > 1:
|
||||
sid = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.symbol == symbol) &
|
||||
(equities_cols.company_symbol == company_symbol) &
|
||||
(equities_cols.share_class_symbol == share_class_symbol) &
|
||||
(equities_cols.start_date <= ad_value),
|
||||
).order_by(
|
||||
equities_cols.start_date.desc(),
|
||||
@@ -315,7 +321,8 @@ class AssetFinder(object):
|
||||
|
||||
else:
|
||||
sids = sa.select((equities_cols.sid,)).where(
|
||||
equities_cols.symbol == symbol,
|
||||
(equities_cols.company_symbol == company_symbol) &
|
||||
(equities_cols.share_class_symbol == share_class_symbol)
|
||||
).execute().fetchall()
|
||||
if len(sids) == 1:
|
||||
return self._retrieve_equity(sids[0]['sid'])
|
||||
@@ -330,7 +337,7 @@ class AssetFinder(object):
|
||||
))
|
||||
)
|
||||
|
||||
def lookup_symbol(self, symbol, as_of_date, fuzzy=False):
|
||||
def lookup_symbol(self, symbol, as_of_date):
|
||||
"""
|
||||
If a fuzzy string is provided, then we try various symbols based on
|
||||
the provided symbol. This is to facilitate mapping from a broker's
|
||||
@@ -341,41 +348,11 @@ class AssetFinder(object):
|
||||
"""
|
||||
|
||||
symbol = symbol.upper()
|
||||
ad_value = pd.Timestamp(normalize_date(as_of_date)).value
|
||||
|
||||
if not fuzzy:
|
||||
try:
|
||||
return self.lookup_symbol_resolve_multiple(symbol, as_of_date)
|
||||
except SymbolNotFound:
|
||||
return None
|
||||
|
||||
fuzzy = symbol.replace(self.fuzzy_char, '')
|
||||
|
||||
equities_cols = self.equities.c
|
||||
candidates = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.fuzzy == fuzzy) &
|
||||
(equities_cols.start_date <= ad_value) &
|
||||
(equities_cols.end_date >= ad_value),
|
||||
).execute().fetchall()
|
||||
|
||||
# If one SID exists for symbol, return that symbol
|
||||
if len(candidates) == 1:
|
||||
return self._retrieve_equity(candidates[0]['sid'])
|
||||
|
||||
# If multiple SIDs exist for symbol, return latest start_date with
|
||||
# end_date as a tie-breaker
|
||||
elif candidates:
|
||||
sid = sa.select((equities_cols.sid,)).where(
|
||||
(equities_cols.symbol == symbol) &
|
||||
(equities_cols.start_date <= ad_value),
|
||||
).order_by(
|
||||
equities_cols.start_date.desc(),
|
||||
equities_cols.end_date.desc(),
|
||||
).scalar()
|
||||
if sid:
|
||||
return self._retrieve_equity(sid)
|
||||
|
||||
raise SymbolNotFound(symbol=symbol)
|
||||
try:
|
||||
return self.lookup_symbol_resolve_multiple(symbol, as_of_date)
|
||||
except SymbolNotFound:
|
||||
return None
|
||||
|
||||
def lookup_future_chain(self, root_symbol, as_of_date, knowledge_date):
|
||||
""" Return the futures chain for a given root symbol.
|
||||
|
||||
Reference in New Issue
Block a user