From 035bfbd514cde8260c70ba09bbaf024846202ce6 Mon Sep 17 00:00:00 2001
From: jfkirk <james.f.kirk@gmail.com>
Date: Mon, 8 Jun 2015 15:53:48 -0400
Subject: [PATCH] ENH: Adds AssetFinder and asset metadata management

---
 tests/test_assets.py       | 500 +++++++++++++++++++++++++++++++++-
 zipline/assets/__init__.py |  22 ++
 zipline/assets/_assets.pyx |  86 ++++--
 zipline/assets/assets.py   | 541 +++++++++++++++++++++++++++++++++++++
 4 files changed, 1115 insertions(+), 34 deletions(-)
 create mode 100644 zipline/assets/assets.py

diff --git a/tests/test_assets.py b/tests/test_assets.py
index fea726ab..8fd41cc7 100644
--- a/tests/test_assets.py
+++ b/tests/test_assets.py
@@ -20,7 +20,177 @@ Tests for the zipline.assets package
 import sys
 from unittest import TestCase
 
-from zipline.assets._assets import Asset, Future
+from datetime import (
+    datetime,
+    timedelta,
+)
+import pickle
+import pprint
+import pytz
+import uuid
+
+import pandas as pd
+
+from nose_parameterized import parameterized
+
+from zipline.finance.trading import with_environment
+from zipline.assets import Asset, Future, AssetFinder
+from zipline.errors import (
+    SymbolNotFound,
+    MultipleSymbolsFound,
+)
+
+
+class FakeTable(object):
+    def __init__(self, name, count, dt, fuzzy_str):
+        self.name = name
+        self.count = count
+        self.dt = dt
+        self.fuzzy_str = fuzzy_str
+        self.df = pd.DataFrame.from_records(
+            [
+                {
+                    'sid': i,
+                    'file_name':  'TEST%s%s' % (self.fuzzy_str, i),
+                    'company_name': self.name + str(i),
+                    'start_date_nano': pd.Timestamp(dt, tz='UTC').value,
+                    'end_date_nano': pd.Timestamp(dt, tz='UTC').value,
+                    'exchange': self.name,
+                }
+                for i in range(1, self.count + 1)
+            ]
+        )
+
+    def read(self, *args, **kwargs):
+        return self.df.to_records()
+
+
+class FakeTableIdenticalSymbols(object):
+    def __init__(self, name, as_of_dates):
+        self.name = name
+        self.as_of_dates = as_of_dates
+        self.df = pd.DataFrame.from_records(
+            [
+                {
+                    'sid': i,
+                    'file_name':  self.name,
+                    'company_name': self.name,
+                    'start_date_nano': date.value,
+                    'end_date_nano': (date + timedelta(days=1)).value,
+                    'exchange': self.name,
+                }
+                for i, date in enumerate(self.as_of_dates)
+            ]
+        )
+
+    def read(self, *args, **kwargs):
+        return self.df.to_records()
+
+
+class FakeTableFromRecords(object):
+
+    def __init__(self, records):
+        self.records = records
+        self.df = pd.DataFrame.from_records(self.records)
+
+    def read(self, *args, **kwargs):
+        return self.df.to_records()
+
+
+@with_environment()
+def build_lookup_generic_cases(env=None):
+    """
+    Generate test cases for AssetFinder test_lookup_generic.
+    """
+
+    unique_start = pd.Timestamp('2013-01-01', tz='UTC')
+    unique_end = pd.Timestamp('2014-01-01', tz='UTC')
+
+    dupe_0_start = pd.Timestamp('2013-01-01', tz='UTC')
+    dupe_0_end = dupe_0_start + timedelta(days=1)
+
+    dupe_1_start = pd.Timestamp('2013-01-03', tz='UTC')
+    dupe_1_end = dupe_1_start + timedelta(days=1)
+
+    table = FakeTableFromRecords(
+        [
+            {
+                'sid': 0,
+                'file_name':  'duplicated',
+                'company_name': 'duplicated_0',
+                'start_date_nano': dupe_0_start.value,
+                'end_date_nano': dupe_0_end.value,
+                'exchange': '',
+            },
+            {
+                'sid': 1,
+                'file_name':  'duplicated',
+                'company_name': 'duplicated_1',
+                'start_date_nano': dupe_1_start.value,
+                'end_date_nano': dupe_1_end.value,
+                'exchange': '',
+            },
+            {
+                'sid': 2,
+                'file_name':  'unique',
+                'company_name': 'unique',
+                'start_date_nano': unique_start.value,
+                'end_date_nano': unique_end.value,
+                'exchange': '',
+            },
+        ],
+    )
+    env.update_asset_finder(asset_metadata=table.df)
+    dupe_0, dupe_1, unique = assets = [
+        env.asset_finder.retrieve_asset(i)
+        for i in range(3)
+    ]
+
+    # This expansion code is run at module import time, which means we have to
+    # clear the AssetFinder here or else it will interfere with the cache
+    # for other tests.
+    env.update_asset_finder(clear_metadata=True)
+
+    dupe_0_start = dupe_0.start_date
+    dupe_1_start = dupe_1.start_date
+    cases = [
+        ##
+        # Scalars
+
+        # Asset object
+        (table, assets[0], None, assets[0]),
+        (table, assets[1], None, assets[1]),
+        (table, assets[2], None, assets[2]),
+        # int
+        (table, 0, None, assets[0]),
+        (table, 1, None, assets[1]),
+        (table, 2, None, assets[2]),
+        # Duplicated symbol with resolution date
+        (table, 'duplicated', dupe_0_start, dupe_0),
+        (table, 'duplicated', dupe_1_start, dupe_1),
+        # Unique symbol, with or without resolution date.
+        (table, 'unique', unique_start, unique),
+        (table, 'unique', None, unique),
+
+        ##
+        # Iterables
+
+        # Iterables of Asset objects.
+        (table, assets, None, assets),
+        (table, iter(assets), None, assets),
+        # Iterables of ints
+        (table, (0, 1), None, assets[:-1]),
+        (table, iter((0, 1)), None, assets[:-1]),
+        # Iterables of symbols.
+        (table, ('duplicated', 'unique'), dupe_0_start, [dupe_0, unique]),
+        (table, ('duplicated', 'unique'), dupe_1_start, [dupe_1, unique]),
+        # Mixed types
+        (table,
+         ('duplicated', 2, 'unique', 1, dupe_1),
+         dupe_0_start,
+         [dupe_0, assets[2], unique, assets[1], dupe_1]),
+    ]
+    return cases
 
 
 class AssetTestCase(TestCase):
@@ -35,8 +205,53 @@ class AssetTestCase(TestCase):
 
         self.assertEquals(str(Asset(5061)), 'Asset(5061)')
 
+    def test_asset_is_pickleable(self):
 
-class TestAssetRichCmp(TestCase):
+        # Very wow
+        s = Asset(
+            1337,
+            symbol="DOGE",
+            asset_name="DOGECOIN",
+            start_date=pd.Timestamp('2013-12-08 9:31AM', tz='UTC'),
+            end_date=pd.Timestamp('2014-06-25 11:21AM', tz='UTC'),
+            first_traded=pd.Timestamp('2013-12-08 9:31AM', tz='UTC'),
+            exchange='THE MOON',
+        )
+        s_unpickled = pickle.loads(pickle.dumps(s))
+
+        attrs_to_check = ['end_date',
+                          'exchange',
+                          'first_traded',
+                          'asset_end_date',
+                          'asset_name',
+                          'asset_start_date',
+                          'sid',
+                          'start_date',
+                          'symbol']
+
+        for attr in attrs_to_check:
+            self.assertEqual(getattr(s, attr), getattr(s_unpickled, attr))
+
+    def test_asset_comparisons(self):
+
+        s_23 = Asset(23)
+        s_24 = Asset(24)
+
+        self.assertEqual(s_23, s_23)
+        self.assertEqual(s_23, 23)
+        self.assertEqual(23, s_23)
+
+        self.assertNotEqual(s_23, s_24)
+        self.assertNotEqual(s_23, 24)
+        self.assertNotEqual(s_23, "23")
+        self.assertNotEqual(s_23, 23.5)
+        self.assertNotEqual(s_23, [])
+        self.assertNotEqual(s_23, None)
+
+        self.assertLess(s_23, s_24)
+        self.assertLess(s_23, 24)
+        self.assertGreater(24, s_23)
+        self.assertGreater(s_24, s_23)
 
     def test_lt(self):
         self.assertTrue(Asset(3) < Asset(4))
@@ -74,16 +289,279 @@ class TestAssetRichCmp(TestCase):
                 'a' < Asset(3)
 
 
-class testFuture(TestCase):
+class TestFuture(TestCase):
+
+    future = Future(2468,
+                    symbol='OMK15',
+                    notice_date='2014-01-20',
+                    expiration_date='2014-02-20',
+                    contract_multiplier=500)
+
+    def test_str(self):
+        strd = self.future.__str__()
+        self.assertEqual("Future(2468 [OMK15])", strd)
 
     def test_repr(self):
+        reprd = self.future.__repr__()
+        self.assertTrue("Future" in reprd)
+        self.assertTrue("2468" in reprd)
+        self.assertTrue("OMK15" in reprd)
+        self.assertTrue("notice_date='2014-01-20'" in reprd)
+        self.assertTrue("expiration_date='2014-02-20'" in reprd)
+        self.assertTrue("contract_multiplier=500" in reprd)
 
-        future = Future(2468,
-                        notice_date='2014-01-20',
-                        expiration_date='2014-02-20')
-        rep = future.__repr__()
+    def test_reduce(self):
+        reduced = self.future.__reduce__()
+        self.assertEqual(Future, reduced[0])
 
-        self.assertTrue("Future" in rep)
-        self.assertTrue("2468" in rep)
-        self.assertTrue("notice_date='2014-01-20'" in rep)
-        self.assertTrue("expiration_date='2014-02-20'" in rep)
+    def test_to_and_from_dict(self):
+        dictd = self.future.to_dict()
+        self.assertTrue('notice_date' in dictd)
+        self.assertTrue('expiration_date' in dictd)
+        self.assertTrue('contract_multiplier' in dictd)
+
+        from_dict = Future.from_dict(dictd)
+        self.assertTrue(isinstance(from_dict, Future))
+        self.assertEqual(self.future, from_dict)
+
+
+class AssetFinderTestCase(TestCase):
+
+    @with_environment()
+    def test_lookup_symbol_fuzzy(self, env=None):
+        fuzzy_str = '@'
+        as_of_date = datetime(2013, 1, 1, tzinfo=pytz.utc)
+        table = FakeTable(uuid.uuid4().hex, 2, as_of_date,
+                          fuzzy_str)
+        env.update_asset_finder(asset_metadata=table.df)
+        sf = env.asset_finder
+
+        try:
+            for i in range(2):  # we do it twice to test for caching bugs
+                self.assertIsNone(sf.lookup_symbol('test', as_of_date))
+                self.assertIsNotNone(sf.lookup_symbol(
+                    'test%s%s' % (fuzzy_str, 1), as_of_date))
+                self.assertIsNone(sf.lookup_symbol('test%s' % 1, as_of_date))
+
+                self.assertIsNone(sf.lookup_symbol(table.name, as_of_date,
+                                                   fuzzy=fuzzy_str))
+                self.assertIsNotNone(sf.lookup_symbol(
+                    'test%s%s' % (fuzzy_str, 1), as_of_date, fuzzy=fuzzy_str))
+                self.assertIsNotNone(sf.lookup_symbol(
+                    'test%s' % 1, as_of_date, fuzzy=fuzzy_str))
+        finally:
+            env.update_asset_finder(clear_metadata=True)
+
+    @with_environment()
+    def test_lookup_symbol_resolve_multiple(self, env=None):
+
+        as_of_dates = [
+            pd.Timestamp('2013-01-01', tz='UTC') + timedelta(days=i)
+            # Incrementing by two so that start and end dates for each
+            # generated Asset don't overlap (each Asset's end_date is the
+            # day after its start date.)
+            for i in range(0, 10, 2)
+        ]
+
+        table = FakeTableIdenticalSymbols(
+            name='existing',
+            as_of_dates=as_of_dates,
+        )
+        env.update_asset_finder(asset_metadata=table.df)
+        sf = env.asset_finder
+
+        try:
+            for _ in range(2):  # we do it twice to test for caching bugs
+                with self.assertRaises(SymbolNotFound):
+                    sf.lookup_symbol_resolve_multiple('non_existing',
+                                                      as_of_dates[0])
+                with self.assertRaises(MultipleSymbolsFound):
+                    sf.lookup_symbol_resolve_multiple('existing',
+                                                      None)
+
+                for i, date in enumerate(as_of_dates):
+                    # Verify that we correctly resolve multiple symbols using
+                    # the supplied date
+                    result = sf.lookup_symbol_resolve_multiple(
+                        'existing',
+                        date,
+                    )
+                    self.assertEqual(result.symbol, 'existing')
+                    self.assertEqual(result.sid, i)
+
+        finally:
+            env.update_asset_finder(clear_metadata=True)
+
+    @with_environment()
+    def test_lookup_symbol_nasdaq_underscore_collisions(self, env=None):
+        """
+        Ensure that each NASDAQ symbol without underscores maps back to the
+        original symbol when using fuzzy matching.
+        """
+        sf = env.asset_finder
+        fuzzy_str = '_'
+        collisions = []
+
+        try:
+            for sid in sf.sids:
+                sec = sf.retrieve_asset(sid)
+                if sec.exchange.startswith('NASDAQ'):
+                    found = sf.lookup_symbol(sec.symbol.replace(fuzzy_str, ''),
+                                             sec.end_date, fuzzy=fuzzy_str)
+                    if found != sec:
+                        collisions.append((found, sec))
+
+            # KNOWN BUG: Filter out assets that have intersections in their
+            # start and end dates.  We can't correctly resolve these.
+            unexpected_errors = []
+            for first, second in collisions:
+                overlapping_dates = (
+                    first.end_date >= second.start_date or
+                    second.end_date >= first.end_date
+                )
+                if not overlapping_dates:
+                    unexpected_errors.append((first, second))
+
+            self.assertFalse(
+                unexpected_errors,
+                pprint.pformat(unexpected_errors),
+            )
+        finally:
+            env.update_asset_finder(clear_metadata=True)
+
+    @parameterized.expand(
+        build_lookup_generic_cases()
+    )
+    @with_environment()
+    def test_lookup_generic(self, table, symbols, reference_date, expected,
+                            env=None):
+        """
+        Ensure that lookup_generic works with various permutations of inputs.
+        """
+        try:
+            env.update_asset_finder(asset_metadata=table.df)
+            finder = env.asset_finder
+            results, missing = finder.lookup_generic(symbols, reference_date)
+            self.assertEqual(results, expected)
+            self.assertEqual(missing, [])
+        finally:
+            env.update_asset_finder(clear_metadata=True)
+
+    @with_environment()
+    def test_lookup_generic_handle_missing(self, env=None):
+        try:
+            table = FakeTableFromRecords(
+                [
+                    # Sids that will be found when we do lookups.
+                    {
+                        'sid': 0,
+                        'file_name': 'real',
+                        'company_name': 'real',
+                        'start_date_nano': pd.Timestamp('2013-1-1', tz='UTC'),
+                        'end_date_nano': pd.Timestamp('2014-1-1', tz='UTC'),
+                        'exchange': '',
+                    },
+                    {
+                        'sid': 1,
+                        'file_name': 'also_real',
+                        'company_name': 'also_real',
+                        'start_date_nano': pd.Timestamp('2013-1-1', tz='UTC'),
+                        'end_date_nano': pd.Timestamp('2014-1-1', tz='UTC'),
+                        'exchange': '',
+                    },
+                    # Sid whose end date is before our query date.  We should
+                    # still correctly find it.
+                    {
+                        'sid': 2,
+                        'file_name': 'real_but_old',
+                        'company_name': 'real_but_old',
+                        'start_date_nano': pd.Timestamp('2002-1-1', tz='UTC'),
+                        'end_date_nano': pd.Timestamp('2003-1-1', tz='UTC'),
+                        'exchange': '',
+                    },
+                    # Sid whose end date is before our query date.  We should
+                    # still correctly find it.
+                    {
+                        'sid': 3,
+                        'file_name': 'real_but_in_the_future',
+                        'company_name': 'real_but_in_the_future',
+                        'start_date_nano': pd.Timestamp('2014-1-1', tz='UTC'),
+                        'end_date_nano': pd.Timestamp('2020-1-1', tz='UTC'),
+                        'exchange': 'THE FUTURE',
+                    },
+                ]
+            )
+            env.update_asset_finder(asset_metadata=table.df)
+            symbols = [
+                'real', 1, 'fake', 'real_but_old', 'real_but_in_the_future',
+            ]
+
+            results, missing = env.asset_finder.lookup_generic(
+                symbols,
+                pd.Timestamp('2013-02-01', tz='UTC'),
+            )
+
+            self.assertEqual(len(results), 3)
+            self.assertEqual(results[0].symbol, 'real')
+            self.assertEqual(results[0].sid, 0)
+            self.assertEqual(results[1].symbol, 'also_real')
+            self.assertEqual(results[1].sid, 1)
+
+            self.assertEqual(len(missing), 2)
+            self.assertEqual(missing[0], 'fake')
+            self.assertEqual(missing[1], 'real_but_in_the_future')
+
+        finally:
+            env.update_asset_finder(clear_metadata=True)
+
+    def test_insert_metadata(self):
+        finder = AssetFinder()
+        finder.insert_metadata(0,
+                               asset_type='equity',
+                               start_date='2014-01-01',
+                               end_date='2015-01-01',
+                               symbol="PLAY",
+                               foo_data="FOO",)
+
+        # Test proper insertion
+        self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
+        self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
+        self.assertEqual('2015-01-01', finder.metadata_cache[0]['end_date'])
+
+        # Test invalid field
+        self.assertFalse('foo_data' in finder.metadata_cache[0])
+
+        # Test updating fields
+        finder.insert_metadata(0,
+                               asset_type='equity',
+                               start_date='2014-01-01',
+                               end_date='2015-02-01',
+                               symbol="PLAY",
+                               exchange="NYSE",)
+        self.assertEqual('2015-02-01', finder.metadata_cache[0]['end_date'])
+        self.assertEqual('NYSE', finder.metadata_cache[0]['exchange'])
+
+        # Check that old data survived
+        self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
+
+    def test_consume_metadata(self):
+
+        # Test dict consumption
+        finder = AssetFinder({0: {'asset_type': 'equity'}})
+        dict_to_consume = {0: {'symbol': 'PLAY'},
+                           1: {'symbol': 'MSFT'}}
+        finder.consume_metadata(dict_to_consume)
+        self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
+        self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
+
+        # Test dataframe consumption
+        df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
+        df['asset_name'][0] = "Dave'N'Busters"
+        df['exchange'][0] = "NASDAQ"
+        df['asset_name'][1] = "Microsoft"
+        df['exchange'][1] = "NYSE"
+        finder.consume_metadata(df)
+        self.assertEqual('NASDAQ', finder.metadata_cache[0]['exchange'])
+        self.assertEqual('Microsoft', finder.metadata_cache[1]['asset_name'])
+        # Check that old data survived
+        self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
diff --git a/zipline/assets/__init__.py b/zipline/assets/__init__.py
index 73d0bbd1..b088688e 100644
--- a/zipline/assets/__init__.py
+++ b/zipline/assets/__init__.py
@@ -12,3 +12,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from ._assets import (
+    Asset,
+    Equity,
+    Future,
+    make_asset_array,
+    CACHE_FILE_TEMPLATE
+)
+from .assets import (
+    AssetFinder,
+    AssetConvertible
+)
+
+__all__ = [
+    'Asset',
+    'Equity',
+    'Future',
+    'AssetFinder',
+    'AssetConvertible',
+    'make_asset_array',
+    'CACHE_FILE_TEMPLATE'
+]
diff --git a/zipline/assets/_assets.pyx b/zipline/assets/_assets.pyx
index 16b8ba06..89d6761b 100644
--- a/zipline/assets/_assets.pyx
+++ b/zipline/assets/_assets.pyx
@@ -21,9 +21,10 @@ cimport cython
 import numpy as np
 cimport numpy as np
 
-cdef enum AssetType:
-    EQUITY = 1
-    FUTURE = 2
+# IMPORTANT NOTE: You must change this template if you change
+# Asset.__reduce__, or else we'll attempt to unpickle an old version of this
+# class
+CACHE_FILE_TEMPLATE = '/tmp/.%s-%s.v4.cache'
 
 cdef class Asset:
 
@@ -33,9 +34,7 @@ cdef class Asset:
 
     cdef readonly object symbol
     cdef readonly object asset_name
-    cdef readonly AssetType asset_type
 
-    # TODO: Maybe declare as pandas Timestamp?
     cdef readonly object start_date
     cdef readonly object end_date
     cdef public object first_traded
@@ -179,26 +178,21 @@ cdef class Asset:
             'exchange': self.exchange,
         }
 
-    @staticmethod
-    def from_dict(dict_):
+    @classmethod
+    def from_dict(cls, dict_):
         """
         Build an Asset instance from a dict.
         """
-        return Asset(**dict_)
+        return cls(**dict_)
 
 
 cdef class Equity(Asset):
 
-    def __cinit__(self,
-                  int sid, # sid is required
-                  object symbol="",
-                  object asset_name="",
-                  object start_date=None,
-                  object end_date=None,
-                  object first_traded=None,
-                  object exchange=""):
-
-        self.asset_type = EQUITY
+    def __str__(self):
+        if self.symbol:
+            return 'Equity(%d [%s])' % (self.sid, self.symbol)
+        else:
+            return 'Equity(%d)' % self.sid
 
     def __repr__(self):
         attrs = ('symbol', 'asset_name', 'exchange',
@@ -214,6 +208,7 @@ cdef class Future(Asset):
 
     cdef readonly object notice_date
     cdef readonly object expiration_date
+    cdef readonly int contract_multiplier
 
     def __cinit__(self,
                   int sid, # sid is required
@@ -224,18 +219,63 @@ cdef class Future(Asset):
                   object notice_date=None,
                   object expiration_date=None,
                   object first_traded=None,
-                  object exchange=""):
+                  object exchange="",
+                  int contract_multiplier=1):
 
-        self.asset_type       = FUTURE
-        self.notice_date      = notice_date
-        self.expiration_date  = expiration_date
+        self.notice_date         = notice_date
+        self.expiration_date     = expiration_date
+        self.contract_multiplier = contract_multiplier
+
+        # Assign the expiration as the end_date if end_date is not explicit
+        if self.end_date is None:
+            self.end_date = expiration_date
+
+    def __str__(self):
+        if self.symbol:
+            return 'Future(%d [%s])' % (self.sid, self.symbol)
+        else:
+            return 'Future(%d)' % self.sid
 
     def __repr__(self):
         attrs = ('symbol', 'asset_name', 'exchange',
                  'start_date', 'end_date', 'first_traded', 'notice_date',
-                 'expiration_date')
+                 'expiration_date', 'contract_multiplier')
         tuples = ((attr, repr(getattr(self, attr, None)))
                   for attr in attrs)
         strings = ('%s=%s' % (t[0], t[1]) for t in tuples)
         params = ', '.join(strings)
         return 'Future(%d, %s)' % (self.sid, params)
+
+    cpdef __reduce__(self):
+        """
+        Function used by pickle to determine how to serialize/deserialize this
+        class.  Should return a tuple whose first element is self.__class__,
+        and whose second element is a tuple of all the attributes that should
+        be serialized/deserialized during pickling.
+        """
+        return (self.__class__, (self.sid,
+                                 self.symbol,
+                                 self.asset_name,
+                                 self.start_date,
+                                 self.end_date,
+                                 self.notice_date,
+                                 self.expiration_date,
+                                 self.first_traded,
+                                 self.exchange,
+                                 self.contract_multiplier,))
+
+    cpdef to_dict(self):
+        """
+        Convert to a python dict.
+        """
+        super_dict = super(Future, self).to_dict()
+        super_dict['notice_date'] = self.notice_date
+        super_dict['expiration_date'] = self.expiration_date
+        super_dict['contract_multiplier'] = self.contract_multiplier
+        return super_dict
+
+
+def make_asset_array(int size, Asset asset):
+    cdef np.ndarray out = np.empty([size], dtype=object)
+    out.fill(asset)
+    return out
diff --git a/zipline/assets/assets.py b/zipline/assets/assets.py
new file mode 100644
index 00000000..8cc234b3
--- /dev/null
+++ b/zipline/assets/assets.py
@@ -0,0 +1,541 @@
+#
+# Copyright 2015 Quantopian, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABCMeta
+from itertools import chain
+from numbers import Integral
+import numpy as np
+import operator
+
+from logbook import Logger
+import pandas as pd
+from pandas.tseries.tools import normalize_date
+from six import with_metaclass, string_types
+
+from zipline.errors import (
+    SymbolNotFound,
+    MultipleSymbolsFound,
+    SidNotFound,
+    IdentifierNotFound,
+    ConsumeAssetMetaDataError,
+    InvalidAssetType,
+)
+from zipline.utils import tradingcalendar
+from zipline.assets._assets import (
+    Asset, Equity, Future
+)
+
+log = Logger('assets.py')
+
+# Expected fields for an Asset's metadata
+ASSET_FIELDS = [
+    'sid',
+    'asset_type',
+    'symbol',
+    'asset_name',
+    'start_date',
+    'end_date',
+    'first_traded',
+    'exchange',
+    'notice_date',
+    'expiration_date',
+    'contract_multiplier',
+    # The following fields are for compatibility with other systems
+    'file_name',  # Used as symbol
+    'company_name',  # Used as asset_name
+    'start_date_nano',  # Used as start_date
+    'end_date_nano',  # Used as end_date
+]
+
+
+class AssetFinder(object):
+
+    def __init__(self,
+                 metadata=None,
+                 trading_calendar=tradingcalendar):
+
+        # Any particular instance of AssetFinder should be
+        # consistent throughout its lifetime, so we grab a reference
+        # to our cache now. That way, if the cache is refreshed later,
+        # our instance will continue to use the old one.
+        self.cache = {}
+        self.sym_cache = {}
+        self.identifier_cache = {}
+        self.fuzzy_match = {}
+
+        # The AssetFinder also holds a nested-dict of all metadata for
+        # reference when building Assets
+        self.metadata_cache = {}
+        if metadata:
+            self.consume_metadata(metadata)
+
+        self.trading_calendar = trading_calendar
+        self.populate_cache()
+
+    def _next_free_sid(self):
+        if len(self.cache) > 0:
+            return max(self.cache.keys()) + 1
+        return 0
+
+    def _assign_sid(self, identifier):
+        if hasattr(identifier, '__int__'):
+            return identifier.__int__()
+        if isinstance(identifier, string_types):
+            return self._next_free_sid()
+
+    def retrieve_asset(self, sid, default_none=False):
+        if isinstance(sid, Asset):
+            return sid
+        asset = self.cache.get(sid)
+        if asset is not None:
+            return asset
+        elif default_none:
+            return None
+        else:
+            raise SidNotFound(sid=sid)
+
+    def retrieve_asset_by_identifier(self, identifier):
+        if isinstance(identifier, Asset):
+            return identifier
+        asset = self.identifier_cache.get(identifier)
+        if asset is not None:
+            return asset
+        else:
+            raise IdentifierNotFound(identifier=identifier)
+
+    @staticmethod
+    def _lookup_symbol_in_infos(infos, as_of_date):
+        """
+        Search a list of symbols matching a given asset for the most recent
+        known symbol as of as_of_date.
+
+        Returns a pair of (Asset, bool), representing the best match we
+        found for as_of_date, and whether or not that match was actually
+        trading at as_of_date.
+
+        If no entry in infos started before as_of_date, return (None, False).
+        """
+        # Sort entries by end_date before iterating.  If asset start and end
+        # dates were always disjoint, then we could sort by either start or
+        # end_date and get the same sorting.
+        infos = sorted(infos, key=operator.attrgetter('end_date'))
+
+        # Find the newest asset that started before as_of_date.
+        candidates = [i for i in infos
+                      if (i.start_date is None or i.start_date <= as_of_date)
+                      and (i.end_date is None or as_of_date <= i.end_date)]
+
+        # If one SID exists for symbol, return that symbol
+        if len(candidates) == 1:
+            return candidates[0], True
+
+        # If no SID exists for symbol, return SID with the
+        # highest-but-not-over end_date
+        if len(candidates) == 0:
+            candidates = [i for i in infos
+                          if i.end_date < as_of_date]
+            return (candidates[-1], False) if candidates else (None, False)
+
+        # If multiple SIDs exist for symbol, return latest start_date with
+        # end_date as a tie-breaker
+        if len(candidates) > 1:
+            best_candidate = sorted(
+                candidates,
+                key=lambda x: (x.start_date, x.end_date)
+            )[-1]
+            return best_candidate, True
+
+    def lookup_symbol_resolve_multiple(self, symbol, as_of_date=None):
+        """
+        Return matching Asset of name symbol in database.
+
+        If multiple Assets are found and as_of_date is not set,
+        raises MultipleSymbolsFound.
+
+        If no Asset was active at as_of_date, and allow_expired is False
+        raises SymbolNotFound.
+        """
+        if as_of_date is not None:
+            as_of_date = normalize_date(as_of_date)
+
+        if symbol not in self.sym_cache:
+            raise SymbolNotFound(symbol=symbol)
+
+        infos = self.sym_cache[symbol]
+        if as_of_date is None:
+            if len(infos) == 1:
+                return infos[0]
+            else:
+                raise MultipleSymbolsFound(symbol=symbol,
+                                           options=str(infos))
+
+        # Try to find symbol matching as_of_date
+        asset, _ = self._lookup_symbol_in_infos(infos, as_of_date)
+        if asset is None:
+            raise SymbolNotFound(symbol=symbol)
+        return asset
+
+    def lookup_symbol(self, symbol, as_of_date, fuzzy=None):
+        """
+        If a fuzzy string is provided, then we try various symbols based on
+        the provided symbol.  This is to facilitate mapping from a broker's
+        symbol to ours in cases where mapping to the broker's symbol loses
+        information. For example, if we have CMCS_A, but a broker has CMCSA,
+        when the broker provides CMCSA, it can also provide fuzzy='_',
+        so we can find a match by inserting an underscore.
+        """
+        symbol = symbol.upper()
+        as_of_date = normalize_date(as_of_date)
+
+        if not fuzzy:
+            try:
+                return self.lookup_symbol_resolve_multiple(symbol, as_of_date)
+            except SymbolNotFound:
+                return None
+        else:
+            try:
+                return self.fuzzy_match[(symbol, fuzzy, as_of_date)]
+            except KeyError:
+                # if symbol is CMCSA and fuzzy is '_', then
+                # try CMCSA, then CMCS_A, then CMC_SA, etc.
+                for fuzzy_symbol in chain(
+                        (symbol,),
+                        (symbol[:i] + fuzzy + symbol[i:]
+                         for i in range(len(symbol) - 1, 0, -1))):
+
+                    infos = self.sym_cache.get(fuzzy_symbol)
+                    if infos:
+                        info, date_match = self._lookup_symbol_in_infos(
+                            infos,
+                            as_of_date,
+                        )
+
+                        if info is not None and date_match:
+                            self.fuzzy_match[(symbol, fuzzy, as_of_date)] = \
+                                info
+                            return info
+                else:
+                    self.fuzzy_match[(symbol, fuzzy, as_of_date)] = None
+
+    def populate_cache(self):
+        """
+        Populates the asset cache with all values in the assets
+        collection.
+        """
+
+        # Wipe caches before repopulating
+        self.cache = {}
+        self.sym_cache = {}
+        self.identifier_cache = {}
+        self.fuzzy_match = {}
+
+        counter = 0
+        for identifier, row in self.metadata_cache.items():
+            self.spawn_asset(identifier=identifier, **row)
+            counter += 1
+
+    def spawn_asset(self, identifier, **kwargs):
+
+        # Check if the sid is declared
+        try:
+            kwargs['sid']
+            pass
+        except KeyError:
+            # Assign the identifier as the sid, if applicable
+            if isinstance(identifier, int):
+                kwargs['sid'] = identifier
+            # If the identifier is not a sid, assign one
+            else:
+                kwargs['sid'] = self._assign_sid(identifier)
+                # Update the metadata object with the new sid
+                self.insert_metadata(identifier=identifier, sid=kwargs['sid'])
+
+        # If the file_name is in the kwargs, it may be the symbol
+        try:
+            file_name = kwargs.pop('file_name')
+            try:
+                kwargs['symbol']
+            except KeyError:
+                kwargs['symbol'] = file_name
+        except KeyError:
+            pass
+
+        # If the identifier coming in was a string and there is no defined
+        # symbol yet, set the symbol to the incoming identifier
+        try:
+            kwargs['symbol']
+            pass
+        except KeyError:
+            if isinstance(identifier, string_types):
+                kwargs['symbol'] = identifier
+
+        # If the company_name is in the kwargs, it may be the asset_name
+        try:
+            company_name = kwargs.pop('company_name')
+            try:
+                kwargs['asset_name']
+            except KeyError:
+                kwargs['asset_name'] = company_name
+        except KeyError:
+            pass
+
+        # If dates are given as nanos, pop them
+        try:
+            kwargs['start_date'] = kwargs.pop('start_date_nano')
+        except KeyError:
+            pass
+        try:
+            kwargs['end_date'] = kwargs.pop('end_date_nano')
+        except KeyError:
+            pass
+        try:
+            kwargs['notice_date'] = kwargs.pop('notice_date_nano')
+        except KeyError:
+            pass
+        try:
+            kwargs['expiration_date'] = kwargs.pop('expiration_date_nano')
+        except KeyError:
+            pass
+
+        # Process dates to Timestamps
+        try:
+            kwargs['start_date'] = pd.Timestamp(kwargs['start_date'], tz='UTC')
+        except KeyError:
+            pass
+        try:
+            kwargs['end_date'] = pd.Timestamp(kwargs['end_date'], tz='UTC')
+        except KeyError:
+            pass
+        try:
+            kwargs['notice_date'] = pd.Timestamp(kwargs['notice_date'],
+                                                 tz='UTC')
+        except KeyError:
+            pass
+        try:
+            kwargs['expiration_date'] = pd.Timestamp(kwargs['expiration_date'],
+                                                     tz='UTC')
+        except KeyError:
+            pass
+
+        # Build an Asset of the appropriate type
+        asset_type = kwargs.pop('asset_type', 'equity')
+        if asset_type.lower() == 'equity':
+            asset = Equity(**kwargs)
+        elif asset_type.lower() == 'future':
+            asset = Future(**kwargs)
+        else:
+            raise InvalidAssetType(asset_type=asset_type)
+
+        self.cache[asset.sid] = asset
+        self.identifier_cache[identifier] = asset
+        if asset.symbol is not "":
+            self.sym_cache.setdefault(asset.symbol, []).append(asset)
+
+        return asset
+
+    @property
+    def sids(self):
+        return self.cache.keys()
+
+    @property
+    def assets(self):
+        return self.cache.values()
+
+    def _lookup_generic_scalar(self,
+                               asset_convertible,
+                               as_of_date,
+                               matches,
+                               missing):
+        """
+        Convert asset_convertible to an asset.
+
+        On success, append to matches.
+        On failure, append to missing.
+        """
+        try:
+            if isinstance(asset_convertible, Asset):
+                matches.append(asset_convertible)
+
+            elif isinstance(asset_convertible, Integral):
+                result = self.retrieve_asset(int(asset_convertible))
+                if result is None:
+                    raise SymbolNotFound(symbol=asset_convertible)
+                matches.append(result)
+
+            elif isinstance(asset_convertible, string_types):
+                # Throws SymbolNotFound on failure to match.
+                matches.append(
+                    self.lookup_symbol_resolve_multiple(
+                        asset_convertible,
+                        as_of_date,
+                    )
+                )
+            else:
+                raise NotAssetConvertible(
+                    "Input was %s, not AssetConvertible."
+                    % asset_convertible
+                )
+
+        except SymbolNotFound:
+            missing.append(asset_convertible)
+            return None
+
+    def lookup_generic(self,
+                       asset_convertible_or_iterable,
+                       as_of_date):
+        """
+        Convert a AssetConvertible or iterable of AssetConvertibles into
+        a list of Asset objects.
+
+        This method exists primarily as a convenience for implementing
+        user-facing APIs that can handle multiple kinds of input.  It should
+        not be used for internal code where we already know the expected types
+        of our inputs.
+
+        Returns a pair of objects, the first of which is the result of the
+        conversion, and the second of which is a list containing any values
+        that couldn't be resolved.
+        """
+        matches = []
+        missing = []
+
+        # Interpret input as scalar.
+        if isinstance(asset_convertible_or_iterable, AssetConvertible):
+            self._lookup_generic_scalar(
+                asset_convertible=asset_convertible_or_iterable,
+                as_of_date=as_of_date,
+                matches=matches,
+                missing=missing,
+            )
+            try:
+                return matches[0], missing
+            except IndexError:
+                if hasattr(asset_convertible_or_iterable, '__int__'):
+                    raise SidNotFound(sid=asset_convertible_or_iterable)
+                else:
+                    raise SymbolNotFound(symbol=asset_convertible_or_iterable)
+
+        # Interpret input as iterable.
+        try:
+            iterator = iter(asset_convertible_or_iterable)
+        except TypeError:
+            raise NotAssetConvertible(
+                "Input was not a AssetConvertible "
+                "or iterable of AssetConvertible."
+            )
+
+        for obj in iterator:
+            self._lookup_generic_scalar(obj, as_of_date, matches, missing)
+        return matches, missing
+
+    def insert_metadata(self, identifier, **kwargs):
+        """
+        Inserts the given metadata kwargs to the entry for the given
+        identifier. Matching fields in the existing entry will be overwritten.
+        :param identifier: The identifier for which to insert metadata
+        :param kwargs: The keyed metadata to insert
+        """
+        entry = self.metadata_cache.get(identifier, {})
+
+        for key, value in kwargs.items():
+            # Do not accept invalid fields
+            if key not in ASSET_FIELDS:
+                continue
+            # Do not accept Nones
+            if value is None:
+                continue
+            # Do not accept nans from dataframes
+            if isinstance(value, float) and np.isnan(value):
+                continue
+            entry[key] = value
+
+        self.metadata_cache[identifier] = entry
+
+    def consume_identifiers(self, identifiers):
+        for identifier in identifiers:
+            self.insert_metadata(identifier)
+
+    def consume_metadata(self, metadata):
+        """
+        Consumes the provided metadata in to the metadata cache. The
+        existing values in the cache will be overwritten when there
+        is a conflict.
+        :param metadata: The metadata to be consumed
+        """
+        # Handle dicts
+        if isinstance(metadata, dict):
+            self._insert_metadata_dict(metadata)
+        # Handle DataFrames
+        elif isinstance(metadata, pd.DataFrame):
+            self._insert_metadata_dataframe(metadata)
+        # Handle readables
+        elif hasattr(metadata, 'read'):
+            self._insert_metadata_readable(metadata)
+        else:
+            raise ConsumeAssetMetaDataError(obj=metadata)
+
+    def clear_metadata(self):
+        self.metadata_cache = {}
+
+    def _insert_metadata_dataframe(self, dataframe):
+        for identifier, row in dataframe.iterrows():
+            self.insert_metadata(identifier, **row)
+
+    def _insert_metadata_dict(self, dict):
+        for identifier, entry in dict.items():
+            self.insert_metadata(identifier, **entry)
+
+    def _insert_metadata_readable(self, readable):
+        for row in readable.read():
+            # Parse out the row of the readable object
+            metadata_dict = {}
+            for field in ASSET_FIELDS:
+                try:
+                    row_value = row[field]
+                    # Avoid passing placeholders
+                    if row_value and (row_value is not 'None'):
+                        metadata_dict[field] = row[field]
+                except KeyError:
+                    continue
+                except IndexError:
+                    continue
+            # Locate the identifier, fail if not found
+            if 'sid' in metadata_dict:
+                identifier = metadata_dict['sid']
+            elif 'symbol' in metadata_dict:
+                identifier = metadata_dict['symbol']
+            else:
+                raise ConsumeAssetMetaDataError(obj=row)
+            self.insert_metadata(identifier, **metadata_dict)
+
+
+class AssetConvertible(with_metaclass(ABCMeta)):
+    """
+    ABC for types that are convertible to integer-representations of
+    Assets.
+
+    Includes Asset, six.string_types, and Integral
+    """
+    pass
+
+AssetConvertible.register(Integral)
+AssetConvertible.register(Asset)
+# Use six.string_types for Python2/3 compatibility
+for type in string_types:
+    AssetConvertible.register(type)
+
+
+class NotAssetConvertible(ValueError):
+    pass