From 780263da06b0443db8b6934864057c102912d212 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Wed, 26 Aug 2015 17:32:44 -0400 Subject: [PATCH] ENH: Return asset-indexed DataFrame for data.factors. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes ordering with the returned assets much easier, and there's no performance degradation for non-broadcasting operations on the Index. Timings ------- from random import sample finder = AssetFinder(create_table=False, 'assets.db') assets = load_8000_assets(finder) AAPL = finder.retrieve_asset(24) RANDOM_ASSETS = sample(assets, 500) df = DataFrame( index=assets, data=np.random.randn(len(assets), 4), columns=['a', 'b', 'c', 'd'], ) df_int = DataFrame( index=map(int, assets), data=np.random.randn(len(assets), 4), columns=['a', 'b', 'c', 'd'], ) %timeit df.loc[24] %timeit df_int.loc[24] 10000 loops, best of 3: 45.3 µs per loop 10000 loops, best of 3: 44.7 µs per loop %timeit df.loc[AAPL] %timeit df_int.loc[AAPL] 10000 loops, best of 3: 45.1 µs per loop 10000 loops, best of 3: 44.8 µs per loop %timeit df.loc[RANDOM_ASSETS] %timeit df_int.loc[RANDOM_ASSETS] 1000 loops, best of 3: 1.53 ms per loop 100 loops, best of 3: 2.18 ms per loop %timeit df.sum() %timeit df_int.sum() 10000 loops, best of 3: 56 µs per loop 10000 loops, best of 3: 55.7 µs per loop %timeit df.index == 3 %timeit df_int.index == 3 1000 loops, best of 3: 253 µs per loop 100000 loops, best of 3: 6.76 µs per loop %timeit df.iloc[:50] %timeit df_int.iloc[:50] 10000 loops, best of 3: 44.3 µs per loop 10000 loops, best of 3: 44 µs per loop --- tests/modelling/test_engine.py | 21 ++++++++++++++++++--- zipline/assets/assets.py | 3 +++ zipline/modelling/engine.py | 8 +++++++- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/tests/modelling/test_engine.py b/tests/modelling/test_engine.py index c647b50f..6cfcd05c 100644 --- a/tests/modelling/test_engine.py +++ b/tests/modelling/test_engine.py @@ -3,6 +3,7 @@ 
Tests for SimpleFFCEngine """ from __future__ import division from unittest import TestCase +from itertools import product from numpy import ( full, @@ -14,9 +15,10 @@ from pandas import ( DataFrame, date_range, Int64Index, + MultiIndex, rolling_mean, - Timestamp, Series, + Timestamp, ) from pandas.util.testing import assert_frame_equal from testfixtures import TempDirectory @@ -61,6 +63,12 @@ class RollingSumDifference(TestingFactor): return (open - close).sum(axis=0) +def assert_product(case, index, *levels): + """Assert that a MultiIndex contains the product of `*levels`.""" + case.assertIsInstance(index, MultiIndex, "%s is not a MultiIndex" % index) + case.assertEqual(set(index), set(product(*levels))) + + class ConstantInputTestCase(TestCase): def setUp(self): @@ -99,14 +107,17 @@ class ConstantInputTestCase(TestCase): def test_single_factor(self): loader = self.loader + finder = self.asset_finder + assets = self.assets engine = SimpleFFCEngine(loader, self.dates, self.asset_finder) - result_shape = (num_dates, num_assets) = (5, len(self.assets)) + result_shape = (num_dates, num_assets) = (5, len(assets)) dates = self.dates[10:10 + num_dates] factor = RollingSumDifference() result = engine.factor_matrix({'f': factor}, dates[0], dates[-1]) self.assertEqual(set(result.columns), {'f'}) + assert_product(self, result.index, dates, finder.retrieve_all(assets)) assert_array_equal( result['f'].unstack().values, @@ -116,8 +127,10 @@ class ConstantInputTestCase(TestCase): def test_multiple_rolling_factors(self): loader = self.loader + finder = self.asset_finder + assets = self.assets engine = SimpleFFCEngine(loader, self.dates, self.asset_finder) - shape = num_dates, num_assets = (5, len(self.assets)) + shape = num_dates, num_assets = (5, len(assets)) dates = self.dates[10:10 + num_dates] short_factor = RollingSumDifference(window_length=3) @@ -133,6 +146,7 @@ class ConstantInputTestCase(TestCase): dates[-1], ) self.assertEqual(set(results.columns), {'short', 'high', 
'long'}) + assert_product(self, results.index, dates, finder.retrieve_all(assets)) # row-wise sum over an array whose values are all (1 - 2) assert_array_equal( @@ -467,6 +481,7 @@ class MultiColumnLoaderTestCase(TestCase): self.dates[-1]) self.assertIsNotNone(result) self.assertEqual({'f'}, set(result.columns)) + # (close - open) * window = (1 - 2) * 3 = -3 # skipped 2 from the start, so that the window is full check_arrays(result['f'], diff --git a/zipline/assets/assets.py b/zipline/assets/assets.py index c012996d..9cc52a1b 100644 --- a/zipline/assets/assets.py +++ b/zipline/assets/assets.py @@ -242,6 +242,9 @@ class AssetFinder(object): else: raise SidNotFound(sid=sid) + def retrieve_all(self, sids, default_none=False): + return [self.retrieve_asset(sid, default_none) for sid in sids] + def _retrieve_equity(self, sid): try: return self._equity_cache[sid] diff --git a/zipline/modelling/engine.py b/zipline/modelling/engine.py index 911e5bc3..06f98e5d 100644 --- a/zipline/modelling/engine.py +++ b/zipline/modelling/engine.py @@ -482,6 +482,12 @@ class SimpleFFCEngine(object): return DataFrame( dict(zip(factor_names, factor_outputs)), index=MultiIndex.from_arrays( - [raw_dates_index, raw_assets_index], + [ + raw_dates_index, + # FUTURE OPTIMIZATION: + # Avoid duplicate lookups by grouping and only looking up + # each unique sid once. + self._finder.retrieve_all(raw_assets_index), + ], ) ).tz_localize('UTC', level=0)