mirror of
https://github.com/wassname/catalyst.git
synced 2026-06-30 01:04:13 +08:00
ENH: Return asset-indexed DataFrame for data.factors.
This makes ordering with the returned assets much easier, and there's no
performance degradation for non-broadcasting operations on the Index.
Timings
-------
from random import sample
finder = AssetFinder(create_table=False, 'assets.db')
assets = load_8000_assets(finder)
AAPL = finder.retrieve_asset(24)
RANDOM_ASSETS = sample(assets, 500)
df = DataFrame(
index=assets,
data=np.random.randn(len(assets), 4),
columns=['a', 'b', 'c', 'd'],
)
df_int = DataFrame(
index=map(int, assets),
data=np.random.randn(len(assets), 4),
columns=['a', 'b', 'c', 'd'],
)
%timeit df.loc[24]
%timeit df_int.loc[24]
10000 loops, best of 3: 45.3 µs per loop
10000 loops, best of 3: 44.7 µs per loop
%timeit df.loc[AAPL]
%timeit df_int.loc[AAPL]
10000 loops, best of 3: 45.1 µs per loop
10000 loops, best of 3: 44.8 µs per loop
%timeit df.loc[RANDOM_ASSETS]
%timeit df_int.loc[RANDOM_ASSETS]
1000 loops, best of 3: 1.53 ms per loop
100 loops, best of 3: 2.18 ms per loop
%timeit df.sum()
%timeit df_int.sum()
10000 loops, best of 3: 56 µs per loop
10000 loops, best of 3: 55.7 µs per loop
%timeit df.index == 3
%timeit df_int.index == 3
1000 loops, best of 3: 253 µs per loop
100000 loops, best of 3: 6.76 µs per loop
%timeit df.iloc[:50]
%timeit df_int.iloc[:50]
10000 loops, best of 3: 44.3 µs per loop
10000 loops, best of 3: 44 µs per loop
This commit is contained in:
@@ -3,6 +3,7 @@ Tests for SimpleFFCEngine
|
||||
"""
|
||||
from __future__ import division
|
||||
from unittest import TestCase
|
||||
from itertools import product
|
||||
|
||||
from numpy import (
|
||||
full,
|
||||
@@ -14,9 +15,10 @@ from pandas import (
|
||||
DataFrame,
|
||||
date_range,
|
||||
Int64Index,
|
||||
MultiIndex,
|
||||
rolling_mean,
|
||||
Timestamp,
|
||||
Series,
|
||||
Timestamp,
|
||||
)
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
from testfixtures import TempDirectory
|
||||
@@ -61,6 +63,12 @@ class RollingSumDifference(TestingFactor):
|
||||
return (open - close).sum(axis=0)
|
||||
|
||||
|
||||
def assert_product(case, index, *levels):
    """Assert that a MultiIndex contains the product of `*levels`.

    Parameters
    ----------
    case : unittest.TestCase
        Test case whose ``assert*`` methods are used to report failures.
    index : MultiIndex
        The index under test.
    *levels : iterables
        One iterable of expected values per index level.

    Notes
    -----
    The comparison is set-based, so neither the ordering of ``index`` nor
    repeated entries are checked.
    """
    # Wrap `index` in a 1-tuple: if a plain tuple is passed by mistake,
    # `"%s" % index` would unpack it as format arguments and raise a
    # TypeError instead of producing the intended assertion failure.
    case.assertIsInstance(
        index,
        MultiIndex,
        "%s is not a MultiIndex" % (index,),
    )
    case.assertEqual(set(index), set(product(*levels)))
|
||||
|
||||
|
||||
class ConstantInputTestCase(TestCase):
|
||||
|
||||
def setUp(self):
|
||||
@@ -99,14 +107,17 @@ class ConstantInputTestCase(TestCase):
|
||||
|
||||
def test_single_factor(self):
|
||||
loader = self.loader
|
||||
finder = self.asset_finder
|
||||
assets = self.assets
|
||||
engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)
|
||||
result_shape = (num_dates, num_assets) = (5, len(self.assets))
|
||||
result_shape = (num_dates, num_assets) = (5, len(assets))
|
||||
dates = self.dates[10:10 + num_dates]
|
||||
|
||||
factor = RollingSumDifference()
|
||||
|
||||
result = engine.factor_matrix({'f': factor}, dates[0], dates[-1])
|
||||
self.assertEqual(set(result.columns), {'f'})
|
||||
assert_product(self, result.index, dates, finder.retrieve_all(assets))
|
||||
|
||||
assert_array_equal(
|
||||
result['f'].unstack().values,
|
||||
@@ -116,8 +127,10 @@ class ConstantInputTestCase(TestCase):
|
||||
def test_multiple_rolling_factors(self):
|
||||
|
||||
loader = self.loader
|
||||
finder = self.asset_finder
|
||||
assets = self.assets
|
||||
engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)
|
||||
shape = num_dates, num_assets = (5, len(self.assets))
|
||||
shape = num_dates, num_assets = (5, len(assets))
|
||||
dates = self.dates[10:10 + num_dates]
|
||||
|
||||
short_factor = RollingSumDifference(window_length=3)
|
||||
@@ -133,6 +146,7 @@ class ConstantInputTestCase(TestCase):
|
||||
dates[-1],
|
||||
)
|
||||
self.assertEqual(set(results.columns), {'short', 'high', 'long'})
|
||||
assert_product(self, results.index, dates, finder.retrieve_all(assets))
|
||||
|
||||
# row-wise sum over an array whose values are all (1 - 2)
|
||||
assert_array_equal(
|
||||
@@ -467,6 +481,7 @@ class MultiColumnLoaderTestCase(TestCase):
|
||||
self.dates[-1])
|
||||
self.assertIsNotNone(result)
|
||||
self.assertEqual({'f'}, set(result.columns))
|
||||
|
||||
# (close - open) * window = (1 - 2) * 3 = -3
|
||||
# skipped 2 from the start, so that the window is full
|
||||
check_arrays(result['f'],
|
||||
|
||||
@@ -242,6 +242,9 @@ class AssetFinder(object):
|
||||
else:
|
||||
raise SidNotFound(sid=sid)
|
||||
|
||||
def retrieve_all(self, sids, default_none=False):
    """Look up every sid in `sids` via ``self.retrieve_asset``.

    Parameters
    ----------
    sids : iterable
        Identifiers to resolve, in the order results should be returned.
    default_none : bool, optional
        If True, a sid whose lookup raises ``SidNotFound`` yields ``None``
        in the output instead of propagating the error. If False (the
        default), the first failed lookup raises.

    Returns
    -------
    list
        One entry per input sid, in input order.

    Raises
    ------
    SidNotFound
        When a lookup fails and `default_none` is False.
    """
    # FUTURE OPTIMIZATION: callers may pass many duplicate sids; each one
    # is currently looked up individually.
    found = []
    for sid in sids:
        try:
            found.append(self.retrieve_asset(sid))
        except SidNotFound:
            # NOTE(review): assumes retrieve_asset signals unknown sids
            # with SidNotFound - confirm against its implementation.
            if not default_none:
                raise
            found.append(None)
    return found
|
||||
|
||||
def _retrieve_equity(self, sid):
|
||||
try:
|
||||
return self._equity_cache[sid]
|
||||
|
||||
@@ -482,6 +482,12 @@ class SimpleFFCEngine(object):
|
||||
return DataFrame(
|
||||
dict(zip(factor_names, factor_outputs)),
|
||||
index=MultiIndex.from_arrays(
|
||||
[raw_dates_index, raw_assets_index],
|
||||
[
|
||||
raw_dates_index,
|
||||
# FUTURE OPTIMIZATION:
|
||||
# Avoid duplicate lookups by grouping and only looking up
|
||||
# each unique sid once.
|
||||
self._finder.retrieve_all(raw_assets_index),
|
||||
],
|
||||
)
|
||||
).tz_localize('UTC', level=0)
|
||||
|
||||
Reference in New Issue
Block a user