ENH: Adds fuzzy symbol look-up and makes it distinct from delimited symbols

2026-07-04 16:12:46 +08:00 · 2015-09-16 14:34:18 -04:00
parent d84bdefef8
commit c446e7f62a
3 changed files with 106 additions and 34 deletions
@@ -286,7 +286,7 @@ class AssetFinderTestCase(TestCase):
    def setUp(self):
        self.env = TradingEnvironment()

-    def test_lookup_symbol_fuzzy(self):
+    def test_lookup_symbol_delimited(self):
        as_of = pd.Timestamp('2013-01-01', tz='UTC')
        frame = pd.DataFrame.from_records(
            [
@@ -302,23 +302,60 @@ class AssetFinderTestCase(TestCase):
            ]
        )
        self.env.write_data(equities_df=frame)
-        finder = AssetFinder(self.env.engine, fuzzy_char='@')
+        finder = AssetFinder(self.env.engine)
        asset_0, asset_1, asset_2 = (
            finder.retrieve_asset(i) for i in range(3)
        )

        # we do it twice to catch caching bugs
        for i in range(2):
-            # Shouldn't find this with no fuzzy_str passed.
            self.assertIsNone(finder.lookup_symbol('test', as_of))
            self.assertIsNone(finder.lookup_symbol('test1', as_of))
-            self.assertEqual(asset_1, finder.lookup_symbol('test.1', as_of))
+            # '@' is not a supported delimiter
+            self.assertIsNone(finder.lookup_symbol('test@1', as_of))

            # Adding an unnecessary fuzzy shouldn't matter.
-            self.assertEqual(asset_1, finder.lookup_symbol('test/1', as_of))
+            for fuzzy_char in ['-', '/', '_', '.']:
+                self.assertEqual(
+                    asset_1,
+                    finder.lookup_symbol('test%s1' % fuzzy_char, as_of)
+                )

-            # Should find exact match.
-            self.assertEqual(asset_1, finder.lookup_symbol('test-1', as_of))
+    def test_lookup_symbol_fuzzy(self):
+        metadata = {
+            0: {'symbol': 'PRTY_HRD'},
+            1: {'symbol': 'BRKA'},
+            2: {'symbol': 'BRK_A'},
+        }
+        self.env.write_data(equities_data=metadata)
+        finder = self.env.asset_finder
+        dt = pd.Timestamp('2013-01-01', tz='UTC')
+
+        # Try combos of looking up PRTYHRD with and without a time or fuzzy
+        # Both non-fuzzy look-ups get no result
+        self.assertIsNone(finder.lookup_symbol('PRTYHRD', None))
+        self.assertIsNone(finder.lookup_symbol('PRTYHRD', dt))
+        # Both fuzzy look-ups work
+        self.assertEqual(0, finder.lookup_symbol('PRTYHRD', None, fuzzy=True))
+        self.assertEqual(0, finder.lookup_symbol('PRTYHRD', dt, fuzzy=True))
+
+        # Try combos of looking up PRTY_HRD, all returning sid 0
+        self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', None))
+        self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', dt))
+        self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', None, fuzzy=True))
+        self.assertEqual(0, finder.lookup_symbol('PRTY_HRD', dt, fuzzy=True))
+
+        # Try combos of looking up BRKA, all returning sid 1
+        self.assertEqual(1, finder.lookup_symbol('BRKA', None))
+        self.assertEqual(1, finder.lookup_symbol('BRKA', dt))
+        self.assertEqual(1, finder.lookup_symbol('BRKA', None, fuzzy=True))
+        self.assertEqual(1, finder.lookup_symbol('BRKA', dt, fuzzy=True))
+
+        # Try combos of looking up BRK_A, all returning sid 2
+        self.assertEqual(2, finder.lookup_symbol('BRK_A', None))
+        self.assertEqual(2, finder.lookup_symbol('BRK_A', dt))
+        self.assertEqual(2, finder.lookup_symbol('BRK_A', None, fuzzy=True))
+        self.assertEqual(2, finder.lookup_symbol('BRK_A', dt, fuzzy=True))

    def test_lookup_symbol(self):

@@ -39,6 +39,7 @@ FUTURE_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
 EQUITY_TABLE_FIELDS = ASSET_TABLE_FIELDS | {
    'company_symbol',
    'share_class_symbol',
+    'fuzzy_symbol',
 }

 EXCHANGE_TABLE_FIELDS = frozenset({
@@ -94,31 +95,32 @@ _root_symbols_defaults = {
 }

 # Fuzzy symbol delimiters that may break up a company symbol and share class
-_fuzzy_symbol_delimiter_regex = r'[./\-_]'
-_fuzzy_symbol_default_triggers = frozenset({np.nan, None, ''})
+_delimited_symbol_delimiter_regex = r'[./\-_]'
+_delimited_symbol_default_triggers = frozenset({np.nan, None, ''})


-def split_fuzzy_symbol(fuzzy_symbol):
+def split_delimited_symbol(symbol):
    """
-    Takes in a symbol that may be fuzzy and splits it in to a company symbol
-    and share class symbol.
+    Takes in a symbol that may be delimited and splits it into a company
+    symbol and share class symbol. Also returns the fuzzy symbol, which is the
+    symbol with all delimiter characters removed.

    Parameters
    ----------
-    fuzzy_symbol : str
-        The possibly-fuzzy symbol to be split
+    symbol : str
+        The possibly-delimited symbol to be split

    Returns
    -------
-    ( str, str )
-        A tuple of ( company_symbol, share_class_symbol )
+    ( str, str, str )
+        A tuple of ( company_symbol, share_class_symbol, fuzzy_symbol )
    """
    # return blank strings for any bad fuzzy symbols, like NaN or None
-    if fuzzy_symbol in _fuzzy_symbol_default_triggers:
-        return ('', '')
+    if symbol in _delimited_symbol_default_triggers:
+        return ('', '', '')

-    split_list = re.split(pattern=_fuzzy_symbol_delimiter_regex,
-                          string=fuzzy_symbol,
+    split_list = re.split(pattern=_delimited_symbol_delimiter_regex,
+                          string=symbol,
                          maxsplit=1)

    # Break the list up in to its two components, the company symbol and the
@@ -128,7 +130,13 @@ def split_fuzzy_symbol(fuzzy_symbol):
        share_class_symbol = split_list[1]
    else:
        share_class_symbol = ''
-    return (company_symbol, share_class_symbol)
+
+    # Strip all delimiter characters from the symbol to get the fuzzy symbol
+    fuzzy_symbol = re.sub(pattern=_delimited_symbol_delimiter_regex,
+                          repl='',
+                          string=symbol)
+
+    return (company_symbol, share_class_symbol, fuzzy_symbol)


 def _generate_output_dataframe(data_subset, defaults):
@@ -194,8 +202,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):

    Methods
    -------
-    write_all(engine, fuzzy_char=None, allow_sid_assignment=True,
-              constraints=False)
+    write_all(engine, allow_sid_assignment=True, constraints=False)
        Write the data supplied at initialization to the database.
    init_db(engine, constraints=False)
        Create the SQLite tables (called by write_all).
@@ -295,6 +302,7 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
            sa.Column('symbol', sa.Text),
            sa.Column('company_symbol', sa.Text),
            sa.Column('share_class_symbol', sa.Text),
+            sa.Column('fuzzy_symbol', sa.Text),
            sa.Column('asset_name', sa.Text),
            sa.Column('start_date', sa.Integer, default=0),
            sa.Column('end_date', sa.Integer),
@@ -404,10 +412,10 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
        )

        # Split symbols to company_symbols and share_class_symbols
-        tuple_series = equities_output['symbol'].apply(split_fuzzy_symbol)
+        tuple_series = equities_output['symbol'].apply(split_delimited_symbol)
        split_symbols = pd.DataFrame(
            tuple_series.tolist(),
-            columns=['company_symbol', 'share_class_symbol'],
+            columns=['company_symbol', 'share_class_symbol', 'fuzzy_symbol'],
            index=tuple_series.index
        )
        equities_output = equities_output.join(split_symbols)
@@ -419,6 +427,8 @@ class AssetDBWriter(with_metaclass(ABCMeta)):
            equities_output.company_symbol.str.upper()
        equities_output['share_class_symbol'] = \
            equities_output.share_class_symbol.str.upper()
+        equities_output['fuzzy_symbol'] = \
+            equities_output.fuzzy_symbol.str.upper()

        # Convert date columns to UNIX Epoch integers (nanoseconds)
        equities_output['start_date'] = \
@@ -39,7 +39,7 @@ from zipline.assets import (
 from zipline.assets.asset_writer import (
    FUTURE_TABLE_FIELDS,
    EQUITY_TABLE_FIELDS,
-    split_fuzzy_symbol,
+    split_delimited_symbol,
 )

 log = Logger('assets.py')
@@ -88,9 +88,8 @@ class AssetFinder(object):
    # reference to an AssetFinder
    PERSISTENT_TOKEN = "<AssetFinder>"

-    def __init__(self, engine, allow_sid_assignment=True, fuzzy_char=None):
+    def __init__(self, engine, allow_sid_assignment=True):

-        self.fuzzy_char = fuzzy_char
        self.allow_sid_assignment = allow_sid_assignment

        self.engine = engine
@@ -262,14 +261,15 @@ class AssetFinder(object):
        self._future_cache[sid] = future
        return future

-    def lookup_symbol(self, symbol, as_of_date, default_None=True):
+    def lookup_symbol(self, symbol, as_of_date, default_None=True,
+                      fuzzy=False):
        """
-        Return matching Asset of name symbol in database.
+        Return matching Equity of name symbol in database.

-        If multiple Assets are found and as_of_date is not set,
+        If multiple Equities are found and as_of_date is not set,
        raises MultipleSymbolsFound.

-        If no Asset was active at as_of_date raises SymbolNotFound, or None
+        If no Equity was active at as_of_date raises SymbolNotFound, or None
        if default_None is true.
        """

@@ -278,19 +278,35 @@ class AssetFinder(object):
        if as_of_date is not None:
            as_of_date = pd.Timestamp(normalize_date(as_of_date))

-        company_symbol, share_class_symbol = split_fuzzy_symbol(symbol)
+        company_symbol, share_class_symbol, fuzzy_symbol = \
+            split_delimited_symbol(symbol)

        equities_cols = self.equities.c
        if as_of_date:
            ad_value = as_of_date.value

-            # If one SID exists for symbol, return that symbol
+            if fuzzy:
+                # Search for a single exact match on the fuzzy column
+                fuzzy_candidates = sa.select((equities_cols.sid,)).where(
+                    (equities_cols.fuzzy_symbol == fuzzy_symbol) &
+                    (equities_cols.start_date <= ad_value) &
+                    (equities_cols.end_date >= ad_value),
+                ).execute().fetchall()
+
+                # If exactly one SID exists for fuzzy_symbol, return its equity
+                if len(fuzzy_candidates) == 1:
+                    return self._retrieve_equity(fuzzy_candidates[0]['sid'])
+
+            # Search for exact matches of the split-up company_symbol and
+            # share_class_symbol
            candidates = sa.select((equities_cols.sid,)).where(
                (equities_cols.company_symbol == company_symbol) &
                (equities_cols.share_class_symbol == share_class_symbol) &
                (equities_cols.start_date <= ad_value) &
                (equities_cols.end_date >= ad_value),
            ).execute().fetchall()
+
+            # If exactly one SID exists for symbol, return its equity
            if len(candidates) == 1:
                return self._retrieve_equity(candidates[0]['sid'])

@@ -327,6 +343,15 @@ class AssetFinder(object):
                raise SymbolNotFound(symbol=symbol)

        else:
+            # If this is a fuzzy look-up, check if there is exactly one match
+            # for the fuzzy symbol
+            if fuzzy:
+                fuzzy_sids = sa.select((equities_cols.sid,)).where(
+                    (equities_cols.fuzzy_symbol == fuzzy_symbol)
+                ).execute().fetchall()
+                if len(fuzzy_sids) == 1:
+                    return self._retrieve_equity(fuzzy_sids[0]['sid'])
+
            sids = sa.select((equities_cols.sid,)).where(
                (equities_cols.company_symbol == company_symbol) &
                (equities_cols.share_class_symbol == share_class_symbol)