ENH: Return asset-indexed DataFrame for data.factors.

This makes ordering with the returned assets much easier, and there's no
performance degradation for non-broadcasting operations on the Index.

Timings
-------

    from random import sample
    finder = AssetFinder('assets.db', create_table=False)
    assets = load_8000_assets(finder)
    AAPL = finder.retrieve_asset(24)
    RANDOM_ASSETS = sample(assets, 500)
    df = DataFrame(
        index=assets,
        data=np.random.randn(len(assets), 4),
        columns=['a', 'b', 'c', 'd'],
    )
    df_int = DataFrame(
        index=map(int, assets),
        data=np.random.randn(len(assets), 4),
        columns=['a', 'b', 'c', 'd'],
    )

    %timeit df.loc[24]
    %timeit df_int.loc[24]

    10000 loops, best of 3: 45.3 µs per loop
    10000 loops, best of 3: 44.7 µs per loop

    %timeit df.loc[AAPL]
    %timeit df_int.loc[AAPL]

    10000 loops, best of 3: 45.1 µs per loop
    10000 loops, best of 3: 44.8 µs per loop

    %timeit df.loc[RANDOM_ASSETS]
    %timeit df_int.loc[RANDOM_ASSETS]

    1000 loops, best of 3: 1.53 ms per loop
    100 loops, best of 3: 2.18 ms per loop

    %timeit df.sum()
    %timeit df_int.sum()

    10000 loops, best of 3: 56 µs per loop
    10000 loops, best of 3: 55.7 µs per loop

    %timeit df.index == 3
    %timeit df_int.index == 3

    1000 loops, best of 3: 253 µs per loop
    100000 loops, best of 3: 6.76 µs per loop

    %timeit df.iloc[:50]
    %timeit df_int.iloc[:50]

    10000 loops, best of 3: 44.3 µs per loop
    10000 loops, best of 3: 44 µs per loop
This commit is contained in:
Scott Sanderson
2015-08-26 17:32:44 -04:00
parent c6aa36e5c1
commit 780263da06
3 changed files with 28 additions and 4 deletions
+18 -3
View File
@@ -3,6 +3,7 @@ Tests for SimpleFFCEngine
"""
from __future__ import division
from unittest import TestCase
from itertools import product
from numpy import (
full,
@@ -14,9 +15,10 @@ from pandas import (
DataFrame,
date_range,
Int64Index,
MultiIndex,
rolling_mean,
Timestamp,
Series,
Timestamp,
)
from pandas.util.testing import assert_frame_equal
from testfixtures import TempDirectory
@@ -61,6 +63,12 @@ class RollingSumDifference(TestingFactor):
return (open - close).sum(axis=0)
def assert_product(case, index, *levels):
    """Assert that a MultiIndex contains the product of `*levels`.

    Parameters
    ----------
    case : object
        Provides ``assertIsInstance`` and ``assertEqual`` (e.g. a
        ``unittest.TestCase`` instance); failures are reported through it.
    index : object
        The value under test; it must be a ``MultiIndex``.
    *levels : iterables
        One iterable per index level. Their cartesian product is the set of
        entries `index` is expected to hold.

    Raises
    ------
    AssertionError
        Raised via `case` when `index` is not a ``MultiIndex`` or when its
        entries differ from the product of `levels`.
    """
    # Wrap `index` in a 1-tuple before formatting: a bare tuple on the right
    # of `%` is unpacked into several format arguments and raises TypeError,
    # which would hide the assertion failure this message is meant to explain.
    case.assertIsInstance(
        index,
        MultiIndex,
        "%s is not a MultiIndex" % (index,),
    )
    # Both sides are compared as sets, so entry order and duplicate entries
    # are not checked here.
    case.assertEqual(set(index), set(product(*levels)))
class ConstantInputTestCase(TestCase):
def setUp(self):
@@ -99,14 +107,17 @@ class ConstantInputTestCase(TestCase):
def test_single_factor(self):
loader = self.loader
finder = self.asset_finder
assets = self.assets
engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)
result_shape = (num_dates, num_assets) = (5, len(self.assets))
result_shape = (num_dates, num_assets) = (5, len(assets))
dates = self.dates[10:10 + num_dates]
factor = RollingSumDifference()
result = engine.factor_matrix({'f': factor}, dates[0], dates[-1])
self.assertEqual(set(result.columns), {'f'})
assert_product(self, result.index, dates, finder.retrieve_all(assets))
assert_array_equal(
result['f'].unstack().values,
@@ -116,8 +127,10 @@ class ConstantInputTestCase(TestCase):
def test_multiple_rolling_factors(self):
loader = self.loader
finder = self.asset_finder
assets = self.assets
engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)
shape = num_dates, num_assets = (5, len(self.assets))
shape = num_dates, num_assets = (5, len(assets))
dates = self.dates[10:10 + num_dates]
short_factor = RollingSumDifference(window_length=3)
@@ -133,6 +146,7 @@ class ConstantInputTestCase(TestCase):
dates[-1],
)
self.assertEqual(set(results.columns), {'short', 'high', 'long'})
assert_product(self, results.index, dates, finder.retrieve_all(assets))
# row-wise sum over an array whose values are all (1 - 2)
assert_array_equal(
@@ -467,6 +481,7 @@ class MultiColumnLoaderTestCase(TestCase):
self.dates[-1])
self.assertIsNotNone(result)
self.assertEqual({'f'}, set(result.columns))
# (close - open) * window = (1 - 2) * 3 = -3
# skipped 2 from the start, so that the window is full
check_arrays(result['f'],
+3
View File
@@ -242,6 +242,9 @@ class AssetFinder(object):
else:
raise SidNotFound(sid=sid)
def retrieve_all(self, sids, default_none=False):
    """Look up every entry of `sids` and return the results as a list.

    Parameters
    ----------
    sids : iterable
        Identifiers to resolve; each one is passed to ``retrieve_asset``.
    default_none : bool, optional
        Forwarded to ``retrieve_asset`` for every lookup. Previously this
        argument was accepted but silently ignored.

    Returns
    -------
    list
        One ``retrieve_asset`` result per entry of `sids`, in the same
        order. Duplicate sids are looked up once per occurrence.
    """
    # NOTE(review): assumes ``retrieve_asset`` accepts a ``default_none``
    # keyword with the same meaning -- confirm against its definition.
    return [
        self.retrieve_asset(sid, default_none=default_none)
        for sid in sids
    ]
def _retrieve_equity(self, sid):
try:
return self._equity_cache[sid]
+7 -1
View File
@@ -482,6 +482,12 @@ class SimpleFFCEngine(object):
return DataFrame(
dict(zip(factor_names, factor_outputs)),
index=MultiIndex.from_arrays(
[raw_dates_index, raw_assets_index],
[
raw_dates_index,
# FUTURE OPTIMIZATION:
# Avoid duplicate lookups by grouping and only looking up
# each unique sid once.
self._finder.retrieve_all(raw_assets_index),
],
)
).tz_localize('UTC', level=0)