Merge pull request #1339 from quantopian/latest-numpy-pandas

Latest numpy and pandas
This commit is contained in:
Scott Sanderson
2016-09-20 22:03:49 -04:00
committed by GitHub
99 changed files with 393 additions and 260 deletions
+3
View File
@@ -55,6 +55,9 @@ nosetests.xml
# Built documentation
docs/_build/*
# Un-tarred example data input. We should only commit the tarball.
tests/resources/example_data/*
# database of vbench
benchmarks.db
+2 -6
View File
@@ -9,8 +9,7 @@ env:
# ANACONDA_TOKEN with api:write
- secure: "RJJjjQloUjCSkhII93QM+YAsr6YYq7hPFvlbnT07ogn1NeUPsVCyJ97oiZfKtPgdbd24hdQP/CHfB0HgTTES8n996tN3QWc4hZj0e10kFyIlas9qnkrRYRR1jxGShBDXrLdx/tGh8z1qnnCm1fy+fDhAF7Zerouwy4EA2YEzxdE="
matrix:
- PANDAS_VERSION=0.16.1 NUMPY_VERSION=1.9.2 SCIPY_VERSION=0.15.1
- PANDAS_VERSION=0.17.1 NUMPY_VERSION=1.10.4 SCIPY_VERSION=0.16.1
- NUMPY_VERSION=1.11.1 SCIPY_VERSION=0.17.1
cache:
directories:
- $HOME/.cache/.pip/
@@ -20,14 +19,11 @@ before_install:
- chmod +x miniconda.sh
- ./miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- sed -i "s/numpy==.*/numpy==$NUMPY_VERSION/" etc/requirements.txt
- sed -i "s/pandas==.*/pandas==$PANDAS_VERSION/" etc/requirements.txt
- sed -i "s/scipy==.*/scipy==$SCIPY_VERSION/" etc/requirements.txt
install:
- conda install conda=4.1.11 conda-build=1.21.11 anaconda-client=1.5.1 --yes
- TALIB_VERSION=$(cat ./etc/requirements_talib.txt | sed "s/TA-Lib==\(.*\)/\1/")
- conda create -n testenv --yes -c quantopian pip python=$TRAVIS_PYTHON_VERSION numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION libgfortran=1.0 ta-lib=$TALIB_VERSION
- conda create -n testenv --yes -c quantopian pip python=$TRAVIS_PYTHON_VERSION numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION libgfortran=3.0 ta-lib=$TALIB_VERSION
- source activate testenv
- IFS='.' read -r -a NPY_VERSION_ARR <<< "$NUMPY_VERSION"
- CONDA_NPY=${NPY_VERSION_ARR[0]}${NPY_VERSION_ARR[1]}
+6 -18
View File
@@ -15,27 +15,15 @@ environment:
matrix:
- PYTHON_VERSION: "2.7"
PYTHON_ARCH: "64"
PANDAS_VERSION: "0.16.1"
NUMPY_VERSION: "1.9.2"
SCIPY_VERSION: "0.15.1"
PANDAS_VERSION: "0.18.1"
NUMPY_VERSION: "1.11.1"
SCIPY_VERSION: "0.17.1"
- PYTHON_VERSION: "3.4"
PYTHON_ARCH: "64"
PANDAS_VERSION: "0.16.1"
NUMPY_VERSION: "1.9.2"
SCIPY_VERSION: "0.15.1"
- PYTHON_VERSION: "2.7"
PYTHON_ARCH: "64"
PANDAS_VERSION: "0.17.1"
NUMPY_VERSION: "1.10.4"
SCIPY_VERSION: "0.16.1"
- PYTHON_VERSION: "3.4"
PYTHON_ARCH: "64"
PANDAS_VERSION: "0.17.1"
NUMPY_VERSION: "1.10.4"
SCIPY_VERSION: "0.16.1"
PANDAS_VERSION: "0.18.1"
NUMPY_VERSION: "1.11.1"
SCIPY_VERSION: "0.17.1"
# We always use a 64-bit machine, but can build x86 distributions
# with the PYTHON_ARCH variable (which is used by CMD_IN_ENV).
+5 -4
View File
@@ -8,13 +8,13 @@ Logbook==0.12.5
# Scientific Libraries
pytz==2015.4
numpy==1.9.2
pytz==2016.4
numpy==1.11.1
# scipy and pandas are required for statsmodels,
# statsmodels in turn is required for some pandas packages
scipy==0.15.1
pandas==0.16.1
scipy==0.17.1
pandas==0.18.1
pandas-datareader==0.2.1
# Needed for parts of pandas.stats
patsy==0.4.0
@@ -38,6 +38,7 @@ contextlib2==0.4.0
# networkx requires decorator
decorator==4.0.0
# Graph algorithms used by zipline.pipeline
networkx==1.9.1
+2 -2
View File
@@ -1,3 +1,3 @@
-e git://github.com/quantopian/datashape.git@bf06a41dc0908baf7c324aeacadba8820468ee78#egg=datashape-dev
-e git://github.com/quantopian/odo.git@9e16310b5f2c3f05162145200db7e7908f0a866e#egg=odo-dev
-e git://github.com/quantopian/blaze.git@7b8f70c15279c7ebacf2ac2f5f74acf08b300d71#egg=blaze-dev
-e git://github.com/quantopian/odo.git@da7f26d87702f5d293763e8ed54c7e25fd3af386#egg=odo-dev
-e git://github.com/quantopian/blaze.git@929afbd3466df7efa0cf720b74dba223e0b3ac4a#egg=blaze-dev
+1 -1
View File
@@ -148,7 +148,7 @@ def _filter_requirements(lines_iter, filter_names=None,
REQ_UPPER_BOUNDS = {
'bcolz': '<1',
'pandas': '<0.18',
'pandas': '<0.19',
}
+2 -2
View File
@@ -590,7 +590,7 @@ class BcolzMinuteBarTestCase(WithTradingCalendars,
'high': full(9, nan),
'low': full(9, nan),
'close': full(9, nan),
'volume': full(9, 0),
'volume': full(9, 0.0),
},
index=[minutes])
self.writer.write_sid(sid, data)
@@ -633,7 +633,7 @@ class BcolzMinuteBarTestCase(WithTradingCalendars,
view(float64),
'close': ((0b11111111111 << 52) + arange(31, 40, dtype=int64)).
view(float64),
'volume': full(9, 0),
'volume': full(9, 0.0),
},
index=[minutes])
self.writer.write_sid(sid, data)
+33
View File
@@ -16,6 +16,7 @@ import datetime
from inspect import isabstract
import random
from unittest import TestCase
import warnings
from nose_parameterized import parameterized
import pandas as pd
@@ -439,6 +440,38 @@ class StatelessRulesTests(RuleTestCase):
self.assertIs(composed.second, rule2)
self.assertFalse(any(map(should_trigger, minute)))
@parameterized.expand([
('month_start', NthTradingDayOfMonth),
('month_end', NDaysBeforeLastTradingDayOfMonth),
('week_start', NthTradingDayOfWeek),
('week_end', NthTradingDayOfWeek),
])
def test_pass_float_to_day_of_period_rule(self, name, rule_type):
with warnings.catch_warnings(record=True) as raised_warnings:
warnings.simplefilter('always')
rule_type(n=3) # Shouldn't trigger a warning.
rule_type(n=3.0) # Should trigger a warning about float coercion.
self.assertEqual(len(raised_warnings), 1)
# We only implicitly convert from float to int when there's no loss of
# precision.
with self.assertRaises(TypeError):
rule_type(3.1)
def test_invalid_offsets(self):
with self.assertRaises(ValueError):
NthTradingDayOfWeek(5)
with self.assertRaises(ValueError):
NthTradingDayOfWeek(-1)
with self.assertRaises(ValueError):
NthTradingDayOfMonth(-1)
with self.assertRaises(ValueError):
NthTradingDayOfMonth(24)
class StatefulRulesTests(RuleTestCase):
CALENDAR_STRING = "NYSE"
+1
View File
@@ -784,6 +784,7 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
expr,
loader=loader,
no_deltas_rule='ignore',
no_checkpoints_rule='ignore',
missing_values=self.missing_values,
)
+4 -1
View File
@@ -11,6 +11,7 @@ from zipline.lib.labelarray import LabelArray
from zipline.pipeline import Pipeline
from zipline.pipeline.data.testing import TestingDataSet as TDS
from zipline.testing import chrange, temp_pipeline_engine
from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
class LatestTestCase(TestCase):
@@ -71,6 +72,8 @@ class LatestTestCase(TestCase):
dates_to_test[-1],
)
for column in columns:
col_result = result[column.name].unstack()
with ignore_pandas_nan_categorical_warning():
col_result = result[column.name].unstack()
expected_col_result = self.expected_latest(column, cal_slice)
assert_frame_equal(col_result, expected_col_result)
+30 -18
View File
@@ -25,12 +25,8 @@ from pandas import (
Categorical,
DataFrame,
date_range,
ewma,
ewmstd,
Int64Index,
MultiIndex,
rolling_apply,
rolling_mean,
Series,
Timestamp,
)
@@ -1008,15 +1004,17 @@ class SyntheticBcolzTestCase(WithAdjustmentReader,
# Shift back the raw inputs by a trading day because we expect our
# computed results to be computed using values anchored on the
# **previous** day's data.
expected_raw = rolling_mean(
expected_raw = DataFrame(
expected_bar_values_2d(
dates - self.trading_calendar.day,
self.equity_info,
'close',
),
).rolling(
window_length,
min_periods=1,
)
).mean(
).values
expected = DataFrame(
# Truncate off the extra rows needed to compute the SMAs.
@@ -1122,19 +1120,31 @@ class ParameterizedFactorTestCase(WithTradingEnvironment, ZiplineTestCase):
def expected_ewma(self, window_length, decay_rate):
alpha = 1 - decay_rate
span = (2 / alpha) - 1
return rolling_apply(
self.raw_data,
window_length,
lambda window: ewma(window, span=span)[-1],
# XXX: This is a comically inefficient way to compute a windowed EWMA.
# Don't use it outside of testing. We're using rolling-apply of an
# ewma (which is itself a rolling-window function) because we only want
# to look at ``window_length`` rows at a time.
return self.raw_data.rolling(window_length).apply(
lambda subarray: (DataFrame(subarray)
.ewm(span=span)
.mean()
.values[-1])
)[window_length:]
def expected_ewmstd(self, window_length, decay_rate):
alpha = 1 - decay_rate
span = (2 / alpha) - 1
return rolling_apply(
self.raw_data,
window_length,
lambda window: ewmstd(window, span=span)[-1],
# XXX: This is a comically inefficient way to compute a windowed
# EWMSTD. Don't use it outside of testing. We're using rolling-apply
# of an ewmstd (which is itself a rolling-window function) because we
# only want to look at ``window_length`` rows at a time.
return self.raw_data.rolling(window_length).apply(
lambda subarray: (DataFrame(subarray)
.ewm(span=span)
.std()
.values[-1])
)[window_length:]
@parameterized.expand([
@@ -1259,7 +1269,7 @@ class ParameterizedFactorTestCase(WithTradingEnvironment, ZiplineTestCase):
expected_1 = (self.raw_data[5:] ** 2) * 2
assert_frame_equal(results['dv1'].unstack(), expected_1)
expected_5 = rolling_mean((self.raw_data ** 2) * 2, window=5)[5:]
expected_5 = ((self.raw_data ** 2) * 2).rolling(5).mean()[5:]
assert_frame_equal(results['dv5'].unstack(), expected_5)
# The following two use USEquityPricing.open and .volume as inputs.
@@ -1269,9 +1279,11 @@ class ParameterizedFactorTestCase(WithTradingEnvironment, ZiplineTestCase):
* self.raw_data[5:] * 2).fillna(0)
assert_frame_equal(results['dv1_nan'].unstack(), expected_1_nan)
expected_5_nan = rolling_mean((self.raw_data_with_nans
* self.raw_data * 2).fillna(0),
window=5)[5:]
expected_5_nan = ((self.raw_data_with_nans * self.raw_data * 2)
.fillna(0)
.rolling(5).mean()
[5:])
assert_frame_equal(results['dv5_nan'].unstack(), expected_5_nan)
+3 -3
View File
@@ -153,7 +153,7 @@ class EventIndexerTestCase(ZiplineTestCase):
@classmethod
def init_class_fixtures(cls):
super(EventIndexerTestCase, cls).init_class_fixtures()
cls.events = make_events(add_nulls=False).sort('event_date')
cls.events = make_events(add_nulls=False).sort_values('event_date')
cls.events.reset_index(inplace=True)
def test_previous_event_indexer(self):
@@ -354,7 +354,7 @@ class EventsLoaderTestCase(WithAssetFinder,
# from pandas won't be tz_localized.
dates = self.trading_days.tz_localize(None)
for asset, asset_result in results.iterkv():
for asset, asset_result in results.iteritems():
relevant_events = events[events.sid == asset.sid]
self.assertEqual(len(relevant_events), 2)
@@ -397,7 +397,7 @@ class EventsLoaderTestCase(WithAssetFinder,
# Remove timezone info from trading days, since the outputs
# from pandas won't be tz_localized.
dates = self.trading_days.tz_localize(None)
for asset, asset_result in results.iterkv():
for asset, asset_result in results.iteritems():
relevant_events = events[events.sid == asset.sid]
self.assertEqual(len(relevant_events), 2)
+5 -2
View File
@@ -346,7 +346,7 @@ class PipelineAlgorithmTestCase(WithBcolzEquityDailyBarReaderFromCSVs,
AAPL = 1
MSFT = 2
BRK_A = 3
assets = ASSET_FINDER_EQUITY_SIDS = AAPL, MSFT, BRK_A
ASSET_FINDER_EQUITY_SIDS = AAPL, MSFT, BRK_A
ASSET_FINDER_EQUITY_SYMBOLS = 'AAPL', 'MSFT', 'BRK_A'
START_DATE = Timestamp('2014')
END_DATE = Timestamp('2015')
@@ -403,6 +403,9 @@ class PipelineAlgorithmTestCase(WithBcolzEquityDailyBarReaderFromCSVs,
)
cls.dates = cls.raw_data[cls.AAPL].index.tz_localize('UTC')
cls.AAPL_split_date = Timestamp("2014-06-09", tz='UTC')
cls.assets = cls.asset_finder.retrieve_all(
cls.ASSET_FINDER_EQUITY_SIDS
)
def compute_expected_vwaps(self, window_lengths):
AAPL, MSFT, BRK_A = self.AAPL, self.MSFT, self.BRK_A
@@ -488,7 +491,7 @@ class PipelineAlgorithmTestCase(WithBcolzEquityDailyBarReaderFromCSVs,
(False,),
])
def test_handle_adjustment(self, set_screen):
AAPL, MSFT, BRK_A = assets = self.AAPL, self.MSFT, self.BRK_A
AAPL, MSFT, BRK_A = assets = self.assets
window_lengths = [1, 2, 5, 10]
vwaps = self.compute_expected_vwaps(window_lengths)
+11 -11
View File
@@ -158,17 +158,17 @@ class TestFastStochasticOscillator(ZiplineTestCase):
fso = FastStochasticOscillator()
today = pd.Timestamp('2015')
assets = np.arange(3, dtype=np.float)
out = np.empty(shape=(3,), dtype=np.float)
assets = np.arange(3, dtype=np.float64)
out = np.empty(shape=(3,), dtype=np.float64)
highs = np.full((50, 3), 3)
lows = np.full((50, 3), 2)
closes = np.full((50, 3), 4)
highs = np.full((50, 3), 3, dtype=np.float64)
lows = np.full((50, 3), 2, dtype=np.float64)
closes = np.full((50, 3), 4, dtype=np.float64)
fso.compute(today, assets, out, closes, lows, highs)
# Expected %K
assert_equal(out, np.full((3,), 200))
assert_equal(out, np.full((3,), 200, dtype=np.float64))
@parameter_space(seed=range(5))
def test_fso_expected_with_talib(self, seed):
@@ -381,7 +381,7 @@ class TestLinearWeightedMovingAverage(ZiplineTestCase):
today = pd.Timestamp('2015')
assets = np.arange(5, dtype=np.int64)
data = np.arange(50, dtype=float).reshape((10, 5))
data = np.arange(50, dtype=np.float64).reshape((10, 5))
out = np.zeros(data.shape[1])
wma2.compute(today, assets, out, data)
@@ -397,9 +397,9 @@ class TestTrueRange(ZiplineTestCase):
assets = np.arange(3, dtype=np.int64)
out = np.empty(3, dtype=np.float64)
highs = np.full((2, 3), 3)
lows = np.full((2, 3), 2)
closes = np.full((2, 3), 1)
highs = np.full((2, 3), 3.)
lows = np.full((2, 3), 2.)
closes = np.full((2, 3), 1.)
tr.compute(today, assets, out, highs, lows, closes)
assert_equal(out, np.full((3,), 2))
assert_equal(out, np.full((3,), 2.))
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
{"names": ["open", "high", "low", "close", "volume", "day", "id"]}
@@ -1 +0,0 @@
{"nbytes": 98584, "shape": [24646], "cbytes": 131072}
@@ -1 +0,0 @@
{"chunklen": 32768, "dtype": "uint32", "expectedlen": 24646, "dflt": 0, "cparams": {"shuffle": true, "clevel": 5}}
@@ -1 +0,0 @@
{"nbytes": 98584, "shape": [24646], "cbytes": 131072}
@@ -1 +0,0 @@
{"chunklen": 32768, "dtype": "uint32", "expectedlen": 24646, "dflt": 0, "cparams": {"shuffle": true, "clevel": 5}}
@@ -1 +0,0 @@
{"nbytes": 98584, "shape": [24646], "cbytes": 131072}
@@ -1 +0,0 @@
{"chunklen": 32768, "dtype": "uint32", "expectedlen": 24646, "dflt": 0, "cparams": {"shuffle": true, "clevel": 5}}
@@ -1 +0,0 @@
{"nbytes": 98584, "shape": [24646], "cbytes": 131072}
@@ -1 +0,0 @@
{"chunklen": 32768, "dtype": "uint32", "expectedlen": 24646, "dflt": 0, "cparams": {"shuffle": true, "clevel": 5}}
@@ -1 +0,0 @@
{"nbytes": 98584, "shape": [24646], "cbytes": 131072}
@@ -1 +0,0 @@
{"chunklen": 32768, "dtype": "uint32", "expectedlen": 24646, "dflt": 0, "cparams": {"shuffle": true, "clevel": 5}}
@@ -1 +0,0 @@
{"nbytes": 98584, "shape": [24646], "cbytes": 131072}
@@ -1 +0,0 @@
{"chunklen": 32768, "dtype": "uint32", "expectedlen": 24646, "dflt": 0, "cparams": {"shuffle": true, "clevel": 5}}
@@ -1 +0,0 @@
{"nbytes": 98584, "shape": [24646], "cbytes": 131072}
@@ -1 +0,0 @@
{"chunklen": 32768, "dtype": "uint32", "expectedlen": 24646, "dflt": 0, "cparams": {"shuffle": true, "clevel": 5}}
File diff suppressed because one or more lines are too long
+6 -2
View File
@@ -167,8 +167,12 @@ def main(ctx, rebuild_input):
shutil.rmtree(expected_perf_path)
shutil.copytree(new_perf_path, expected_perf_path)
with tarfile.open(example_path, 'w|gz') as tar:
tar.add(d.getpath('example_data'), 'example_data')
# Clear out all the temporary new perf so it doesn't get added to the
# tarball.
shutil.rmtree(d.getpath('example_data/new_perf/'))
with tarfile.open(example_path, 'w|gz') as tar:
tar.add(d.getpath('example_data'), 'example_data')
if __name__ == '__main__':
+7 -7
View File
@@ -32,7 +32,7 @@ from testfixtures import TempDirectory
import numpy as np
import pandas as pd
import pytz
from pandas.io.common import PerformanceWarning
from pandas.core.common import PerformanceWarning
from zipline import run_algorithm
from zipline import TradingAlgorithm
@@ -3476,7 +3476,7 @@ class TestOrderCancelation(WithDataPortal,
'high': minutes_arr + 2,
'low': minutes_arr - 1,
'close': minutes_arr,
'volume': np.full(minutes_count, 1),
'volume': np.full(minutes_count, 1.0),
},
index=asset_minutes,
)
@@ -3485,11 +3485,11 @@ class TestOrderCancelation(WithDataPortal,
def make_equity_daily_bar_data(cls):
yield 1, pd.DataFrame(
{
'open': np.full(3, 1),
'high': np.full(3, 1),
'low': np.full(3, 1),
'close': np.full(3, 1),
'volume': np.full(3, 1),
'open': np.full(3, 1, dtype=np.float64),
'high': np.full(3, 1, dtype=np.float64),
'low': np.full(3, 1, dtype=np.float64),
'close': np.full(3, 1, dtype=np.float64),
'volume': np.full(3, 1, dtype=np.float64),
},
index=cls.sim_params.sessions,
)
+1 -1
View File
@@ -3,7 +3,7 @@ import warnings
from mock import patch
import numpy as np
import pandas as pd
from pandas.io.common import PerformanceWarning
from pandas.core.common import PerformanceWarning
from zipline import TradingAlgorithm
from zipline.finance.trading import SimulationParameters
+2 -24
View File
@@ -297,7 +297,6 @@ def handle_data(context, data):
@parameterized.expand([("unspecified", ""),
("none", "usecols=None"),
("empty", "usecols=[]"),
("without date", "usecols=['Value']"),
("with date", "usecols=('Value', 'Date')")])
def test_usecols(self, testname, usecols):
@@ -325,30 +324,9 @@ def initialize(context):
context.stocks = [sid(3766), sid(25317)]
def handle_data(context, data):
if {should_have_data}:
try:
data.current("urban", "cpi")
except (KeyError, ValueError):
assert False
else:
try:
data.current("urban", "cpi")
except (KeyError, ValueError):
assert True
data.current("urban", "cpi")
"""
results = self.run_algo(
code.format(
usecols=usecols,
should_have_data=testname in [
'none',
'unspecified',
'without date',
'with date',
],
)
)
results = self.run_algo(code.format(usecols=usecols))
# 251 trading days in 2006
self.assertEqual(len(results), 251)
+29 -15
View File
@@ -1,6 +1,7 @@
from itertools import product
from operator import eq, ne
import numpy as np
import warnings
from zipline.lib.labelarray import LabelArray
from zipline.testing import check_arrays, parameter_space, ZiplineTestCase
@@ -234,22 +235,35 @@ class LabelArrayTestCase(ZiplineTestCase):
l = LabelArray(self.strs, '')
ints = np.arange(len(l))
for func in all_ufuncs():
# Different ufuncs vary between returning NotImplemented and
# raising a TypeError when provided with unknown dtypes.
# This is a bit unfortunate, but still better than silently
# accepting an int array.
try:
if func.nin == 1:
ret = func(l)
elif func.nin == 2:
ret = func(l, ints)
with warnings.catch_warnings():
# Some ufuncs return NotImplemented, but warn that they will fail
# in the future. Both outcomes are fine, so ignore the warnings.
warnings.filterwarnings(
'ignore',
message="unorderable dtypes.*",
category=DeprecationWarning,
)
warnings.filterwarnings(
'ignore',
message="elementwise comparison failed.*",
category=FutureWarning,
)
for func in all_ufuncs():
# Different ufuncs vary between returning NotImplemented and
# raising a TypeError when provided with unknown dtypes.
# This is a bit unfortunate, but still better than silently
# accepting an int array.
try:
if func.nin == 1:
ret = func(l)
elif func.nin == 2:
ret = func(l, ints)
else:
self.fail("Who added a ternary ufunc !?!")
except TypeError:
pass
else:
self.fail("Who added a ternary ufunc !?!")
except TypeError:
pass
else:
self.assertIs(ret, NotImplemented)
self.assertIs(ret, NotImplemented)
@parameter_space(
__fail_fast=True,
+3 -2
View File
@@ -819,8 +819,9 @@ class TradingAlgorithm(object):
else:
self.risk_report = perf
daily_dts = [np.datetime64(perf['period_close'], utc=True)
for perf in daily_perfs]
daily_dts = pd.DatetimeIndex(
[p['period_close'] for p in daily_perfs], tz='UTC'
)
daily_stats = pd.DataFrame(daily_perfs, index=daily_dts)
return daily_stats
+1 -1
View File
@@ -189,7 +189,7 @@ def _generate_output_dataframe(data_subset, defaults):
def _check_asset_group(group):
row = group.sort('end_date').iloc[-1]
row = group.sort_values('end_date').iloc[-1]
row.start_date = group.start_date.min()
row.end_date = group.end_date.max()
row.drop(list(symbol_columns), inplace=True)
+1 -1
View File
@@ -224,7 +224,7 @@ def make_future_info(first_sid,
'exchange': "TEST",
'exchange_full': 'TEST FULL',
})
return pd.DataFrame.from_records(contracts, index='sid').convert_objects()
return pd.DataFrame.from_records(contracts, index='sid')
def make_commodity_future_info(first_sid,
+4 -2
View File
@@ -33,6 +33,8 @@ from zipline.lib.adjustment import Float64Multiply
from zipline.assets.asset_writer import (
SQLITE_MAX_VARIABLE_NUMBER as SQLITE_MAX_IN_STATEMENT,
)
from zipline.utils.pandas_utils import timedelta_to_integral_seconds
_SID_QUERY_TEMPLATE = """
SELECT DISTINCT sid FROM {0}
@@ -170,8 +172,8 @@ cpdef load_adjustments_from_sqlite(object adjustments_db, # sqlite3.Connection
index.
"""
cdef int start_date = int((dates[0] - EPOCH).total_seconds())
cdef int end_date = int((dates[-1] - EPOCH).total_seconds())
cdef int start_date = timedelta_to_integral_seconds(dates[0] - EPOCH)
cdef int end_date = timedelta_to_integral_seconds(dates[-1] - EPOCH)
cdef set split_sids = _get_split_sids(
adjustments_db,
+1 -1
View File
@@ -119,7 +119,7 @@ def fetch_symbol_metadata_frame(api_key,
'name': 'asset_name',
'oldest_available_date': 'start_date',
'newest_available_date': 'end_date',
}).sort('symbol')
}).sort_values('symbol')
data = data[~data.symbol.isin(excluded_symbols)]
# cut out all the other stuff in the name column
+49 -29
View File
@@ -44,6 +44,7 @@ from zipline.utils.math_utils import (
nanstd
)
from zipline.utils.memoize import remember_last, weak_lru_cache
from zipline.utils.pandas_utils import timedelta_to_integral_minutes
from zipline.errors import (
NoTradeDataAvailableTooEarly,
NoTradeDataAvailableTooLate,
@@ -1106,44 +1107,63 @@ class DataPortal(object):
else:
return [assets] if isinstance(assets, Asset) else []
# cache size picked somewhat loosely. this code exists purely to
# handle deprecated API.
@weak_lru_cache(20)
def _get_minute_count_for_transform(self, ending_minute, days_count):
# cache size picked somewhat loosely. this code exists purely to
# handle deprecated API.
# This function works in three steps.
# Step 1. Count the minutes from ``ending_minute`` to the start of its
# session.
# Step 2. Count the minutes from the prior ``days_count - 1`` sessions.
# Step 3. Return the sum of the results from steps (1) and (2).
# bars is the number of days desired. we have to translate that
# into the number of minutes we want.
# we get all the minutes for the last (bars - 1) days, then add
# all the minutes so far today. the +2 is to account for ignoring
# today, and the previous day, in doing the math.
session_for_minute = self.trading_calendar.minute_to_session_label(
ending_minute
)
previous_session = self.trading_calendar.previous_session_label(
session_for_minute
# Example (NYSE Calendar)
# ending_minute = 2015-12-28 9:40 AM US/Eastern
# days_count = 3
# Step 1. Calculate that there are 10 minutes in the ending session.
# Step 2. Calculate that there are 390 + 210 = 600 minutes in the prior
# two sessions. (Prior sessions are 2015-12-23 and 2015-12-24.)
# 2015-12-24 is a half day.
# Step 3. Return 600 + 10 = 610.
cal = self.trading_calendar
ending_session = cal.minute_to_session_label(
ending_minute,
direction="none", # It's an error to pass a non-trading minute.
)
sessions = self.trading_calendar.sessions_in_range(
self.trading_calendar.sessions_window(previous_session,
-days_count + 2)[0],
previous_session,
# Assume that calendar days are always full of contiguous minutes,
# which means we can just take 1 + (number of minutes between the last
# minute and the start of the session). We add one so that we include
# the ending minute in the total.
ending_session_minute_count = timedelta_to_integral_minutes(
ending_minute - cal.open_and_close_for_session(ending_session)[0]
) + 1
if days_count == 1:
# We just need sessions for the active day.
return ending_session_minute_count
# XXX: We're subtracting 2 here to account for two offsets:
# 1. We only want ``days_count - 1`` sessions, since we've already
# accounted for the ending session above.
# 2. The API of ``sessions_window`` is to return one more session than
# the requested number. I don't think any consumers actually want
# that behavior, but it's the tested and documented behavior right
# now, so we have to request one less session than we actually want.
completed_sessions = cal.sessions_window(
cal.previous_session_label(ending_session),
2 - days_count,
)
minutes_count = \
completed_sessions_minute_count = (
self.trading_calendar.minutes_count_for_sessions_in_range(
sessions[0],
sessions[-1]
completed_sessions[0],
completed_sessions[-1]
)
# add the minutes for today
today_open = self.trading_calendar.open_and_close_for_session(
session_for_minute
)[0]
minutes_count += \
((ending_minute - today_open).total_seconds() // 60) + 1
return minutes_count
)
return ending_session_minute_count + completed_sessions_minute_count
def get_simple_transform(self, asset, transform_name, dt, data_frequency,
bars=None):
+3 -3
View File
@@ -695,12 +695,12 @@ class BcolzMinuteBarWriter(object):
all_minutes = self._minute_index
# Get the latest minute we wish to write to the ctable
last_minute_to_write = dts[-1]
last_minute_to_write = pd.Timestamp(dts[-1], tz='UTC')
# In the event that we've already written some minutely data to the
# ctable, guard against overwritting that data.
# ctable, guard against overwriting that data.
if num_rec_mins > 0:
last_recorded_minute = np.datetime64(all_minutes[num_rec_mins - 1])
last_recorded_minute = all_minutes[num_rec_mins - 1]
if last_minute_to_write <= last_recorded_minute:
raise BcolzMinuteOverlappingData(dedent("""
Data with last_date={0} already includes input start={1} for
+1 -2
View File
@@ -54,8 +54,7 @@ def minute_to_session(minute_frame, calendar):
"""
how = OrderedDict((c, _MINUTE_TO_SESSION_OHCLV_HOW[c])
for c in minute_frame.columns)
return minute_frame.groupby(calendar.minute_to_session_label).agg(
how)
return minute_frame.groupby(calendar.minute_to_session_label).agg(how)
class DailyHistoryAggregator(object):
+5 -4
View File
@@ -19,6 +19,7 @@ import logbook
from six import iteritems
import numpy as np
import pandas as pd
from . import risk
@@ -95,11 +96,11 @@ class RiskMetricsPeriod(object):
raise Exception(message)
self.num_trading_days = len(self.benchmark_returns)
self.trading_day_counts = pd.stats.moments.rolling_count(
self.algorithm_returns, self.num_trading_days)
self.mean_algorithm_returns = \
self.algorithm_returns.cumsum() / self.trading_day_counts
self.mean_algorithm_returns = (
self.algorithm_returns.cumsum() /
np.arange(1, self.num_trading_days + 1, dtype=np.float64)
)
self.benchmark_volatility = annual_volatility(self.benchmark_returns)
self.algorithm_volatility = annual_volatility(self.algorithm_returns)
+14 -13
View File
@@ -24,6 +24,7 @@ from zipline.utils.numpy_utils import (
int_dtype_with_size_in_bytes,
is_object,
)
from zipline.utils.pandas_utils import ignore_pandas_nan_categorical_warning
from ._factorize import (
factorize_strings,
@@ -231,8 +232,9 @@ class LabelArray(ndarray):
In this case, obj will be None. We treat this as an error case and
fail.
2. Someone (most likely our own __new__) calls
other_array.view(type=LabelArray).
2. Someone (most likely our own __new__) does::
>>> other_array.view(type=LabelArray) # doctest: +SKIP
In this case, `self` will be the new LabelArray instance, and
``obj`` will be the array on which ``view`` is being called.
@@ -283,14 +285,16 @@ class LabelArray(ndarray):
"""
if len(self.shape) > 1:
raise ValueError("Can't convert a 2D array to a categorical.")
return pd.Categorical.from_codes(
self.as_int_array(),
# We need to make a copy because pandas >= 0.17 fails if this
# buffer isn't writeable.
self.categories.copy(),
ordered=False,
name=name,
)
with ignore_pandas_nan_categorical_warning():
return pd.Categorical.from_codes(
self.as_int_array(),
# We need to make a copy because pandas >= 0.17 fails if this
# buffer isn't writeable.
self.categories.copy(),
ordered=False,
name=name,
)
def as_categorical_frame(self, index, columns, name=None):
"""
@@ -475,9 +479,6 @@ class LabelArray(ndarray):
)
def __repr__(self):
# This happens if you call a ufunc on a LabelArray that changes the
# dtype. This is generally an indicator that the array has been used
# incorrectly, and it means we're no longer valid for anything.
repr_lines = repr(self.as_string_array()).splitlines()
repr_lines[0] = repr_lines[0].replace('array(', 'LabelArray(', 1)
repr_lines[-1] = repr_lines[-1].rsplit(',', 1)[0] + ')'
+1 -10
View File
@@ -18,7 +18,7 @@ from numpy import apply_along_axis, float64, isnan, nan
from scipy.stats import rankdata
from zipline.utils.numpy_utils import (
is_float,
is_missing,
float64_dtype,
int64_dtype,
datetime64ns_dtype,
@@ -28,15 +28,6 @@ from zipline.utils.numpy_utils import (
import_array()
cpdef is_missing(ndarray data, object missing_value):
"""
Generic is_missing function that handles quirks with NaN.
"""
if is_float(data) and isnan(missing_value):
return isnan(data)
return (data == missing_value)
def rankdata_1d_descending(ndarray data, str method):
"""
1D descending version of scipy.stats.rankdata.
+1 -2
View File
@@ -187,7 +187,6 @@ from zipline.utils.input_validation import (
optionally,
)
from zipline.utils.numpy_utils import bool_dtype, categorical_dtype
from zipline.utils.pandas_utils import sort_values
from zipline.utils.pool import SequentialPool
from zipline.utils.preprocess import preprocess
@@ -754,7 +753,7 @@ def overwrite_novel_deltas(baseline, deltas, dates):
ignore_index=True,
copy=False,
)
sort_values(cat, TS_FIELD_NAME, inplace=True)
cat.sort_values(TS_FIELD_NAME, inplace=True)
return cat, non_novel_deltas
+4 -2
View File
@@ -109,10 +109,12 @@ class EventsLoader(PipelineLoader):
events = events[events[EVENT_DATE_FIELD_NAME].notnull()]
# We always work with entries from ``events`` directly as numpy arrays,
# so we coerce from a frame here.
# so we coerce from a frame to a dict of arrays here.
self.events = {
name: np.asarray(series)
for name, series in events.sort(EVENT_DATE_FIELD_NAME).iteritems()
for name, series in (
events.sort_values(EVENT_DATE_FIELD_NAME).iteritems()
)
}
# Columns to load with self.load_next_events.
+1 -2
View File
@@ -16,7 +16,6 @@ from pandas import (
from zipline.lib.adjusted_array import AdjustedArray
from zipline.lib.adjustment import make_adjustment_from_labels
from zipline.utils.numpy_utils import as_column
from zipline.utils.pandas_utils import sort_values
from .base import PipelineLoader
ADJUSTMENT_COLUMNS = Index([
@@ -73,7 +72,7 @@ class DataFrameLoader(PipelineLoader):
else:
# Ensure that columns are in the correct order.
adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
sort_values(adjustments, ['apply_date', 'sid'], inplace=True)
adjustments.sort_values(['apply_date', 'sid'], inplace=True)
self.adjustments = adjustments
self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
+2 -2
View File
@@ -172,7 +172,7 @@ class SeededRandomLoader(PrecomputedLoader):
"""
Return uniformly-distributed integers between 0 and 100.
"""
return (self.state.random_integers(low=0, high=100, size=shape)
return (self.state.randint(low=0, high=100, size=shape)
.astype('int64')) # default is system int
def _datetime_values(self, shape):
@@ -180,7 +180,7 @@ class SeededRandomLoader(PrecomputedLoader):
Return uniformly-distributed dates in 2014.
"""
start = Timestamp('2014', tz='UTC').asm8
offsets = self.state.random_integers(
offsets = self.state.randint(
low=0,
high=364,
size=shape,
+3 -3
View File
@@ -224,13 +224,13 @@ class PandasCSV(with_metaclass(ABCMeta, object)):
date_str_series.values,
format=format_str,
utc=True,
coerce=True,
errors='coerce',
)
else:
parsed = pd.to_datetime(
date_str_series.values,
format=format_str,
coerce=True,
errors='coerce',
).tz_localize(tz_str).tz_convert('UTC')
if data_frequency == 'daily':
@@ -301,7 +301,7 @@ class PandasCSV(with_metaclass(ABCMeta, object)):
df['sid'] = self.symbol
elif self.finder:
df.sort(self.symbol_column)
df.sort_values(by=self.symbol_column, inplace=True)
# Pop the 'sid' column off of the DataFrame, just in case the user
# has assigned it, and throw a warning
+16 -3
View File
@@ -49,7 +49,8 @@ from zipline.pipeline.loaders.testing import make_seeded_random_loader
from zipline.utils import security_list
from zipline.utils.calendars import get_calendar
from zipline.utils.input_validation import expect_dimensions
from zipline.utils.numpy_utils import as_column
from zipline.utils.numpy_utils import as_column, isnat
from zipline.utils.pandas_utils import timedelta_to_integral_seconds
from zipline.utils.sentinel import sentinel
import numpy as np
@@ -78,7 +79,7 @@ def str_to_seconds(s):
>>> str_to_seconds('2014-01-01')
1388534400
"""
return int((pd.Timestamp(s, tz='UTC') - EPOCH).total_seconds())
return timedelta_to_integral_seconds(pd.Timestamp(s, tz='UTC') - EPOCH)
def drain_zipline(test, zipline):
@@ -393,6 +394,18 @@ def check_arrays(x, y, err_msg='', verbose=True, check_dtypes=True):
# ...then check the actual values as well.
x = x.as_string_array()
y = y.as_string_array()
elif x.dtype.kind in 'mM':
x_isnat = isnat(x)
y_isnat = isnat(y)
assert_array_equal(
x_isnat,
y_isnat,
err_msg="NaTs not equal",
verbose=verbose,
)
# Fill NaTs with zero for comparison. Each array must be filled from its
# own values; building ``y`` from ``x`` would make the final equality
# check compare ``x`` against itself and always pass.
x = np.where(x_isnat, np.zeros_like(x), x)
y = np.where(y_isnat, np.zeros_like(y), y)
return assert_array_equal(x, y, err_msg=err_msg, verbose=verbose)
@@ -709,7 +722,7 @@ class FakeDataPortal(DataPortal):
]
df = pd.DataFrame(
np.full((bar_count, len(assets)), 100),
np.full((bar_count, len(assets)), 100.0),
index=days,
columns=assets
)
+9 -1
View File
@@ -14,6 +14,7 @@
# limitations under the License.
from abc import ABCMeta, abstractproperty
from lru import LRU
import warnings
from pandas.tseries.holiday import AbstractHolidayCalendar
from six import with_metaclass
@@ -66,7 +67,14 @@ class TradingCalendar(with_metaclass(ABCMeta)):
"""
def __init__(self, start=start_default, end=end_default):
# Midnight in UTC for each trading day.
_all_days = date_range(start, end, freq=self.day, tz='UTC')
# In pandas 0.18.1, pandas calls into its own code here in a way that
# fires a warning. The calling code in pandas tries to suppress the
# warning, but does so incorrectly, causing it to bubble out here.
# Actually catch and suppress the warning here:
with warnings.catch_warnings():
warnings.simplefilter('ignore')
_all_days = date_range(start, end, freq=self.day, tz='UTC')
# `DatetimeIndex`s of standard opens/closes for each day.
self._opens = days_at_time(_all_days, self.open_time, self.tz,
+56 -15
View File
@@ -15,12 +15,17 @@
from abc import ABCMeta, abstractmethod
from collections import namedtuple
import six
import warnings
import datetime
import numpy as np
import pandas as pd
import pytz
from toolz import curry
from zipline.utils.input_validation import preprocess
from zipline.utils.memoize import lazyval
from .context_tricks import nop_context
@@ -147,6 +152,31 @@ def _build_time(time, kwargs):
return datetime.time(**kwargs)
@curry
def lossless_float_to_int(funcname, func, argname, arg):
    """
    A preprocessor that coerces integral floats to ints.

    Receipt of non-integral floats raises a TypeError.

    Parameters
    ----------
    funcname : str
        Name reported in the warning message as the receiver of ``arg``.
    func : callable
        Not used by this processor.
        NOTE(review): presumably the function being preprocessed, supplied
        by ``preprocess`` -- confirm against that decorator.
    argname : str
        Name of the argument being processed; reported in the warning.
    arg : object
        The value to process.

    Returns
    -------
    object
        ``arg`` unchanged when it is not a float, otherwise ``int(arg)``.

    Raises
    ------
    TypeError
        If ``arg`` is a float without an exact integer value. This includes
        NaN and +/-inf.
    """
    if not isinstance(arg, float):
        return arg

    # ``float.is_integer`` is False for NaN and +/-inf, so those values fall
    # through to the TypeError below instead of escaping as the ValueError /
    # OverflowError that ``int()`` raises for them.
    if arg.is_integer():
        warnings.warn(
            "{f} expected an int for argument {name!r}, but got float {arg}."
            " Coercing to int.".format(
                f=funcname,
                name=argname,
                arg=arg,
            ),
        )
        return int(arg)

    raise TypeError(arg)
class EventManager(object):
"""Manages a list of Event objects.
This manages the logic for checking the rules and dispatching to the
@@ -401,23 +431,28 @@ class NotHalfDay(StatelessRule):
class TradingDayOfWeekRule(six.with_metaclass(ABCMeta, StatelessRule)):
@preprocess(n=lossless_float_to_int('TradingDayOfWeekRule'))
def __init__(self, n, invert):
if not 0 <= n < MAX_WEEK_RANGE:
raise _out_of_range_error(MAX_WEEK_RANGE)
self.td_delta = (-n - 1) if invert else n
@lazyval
def execution_periods(self):
# calculate the list of periods that match the given criteria
return self.cal.schedule.groupby(
pd.Grouper(freq="W")
).nth(int(self.td_delta)).index
def should_trigger(self, dt):
# is this market minute's period in the list of execution periods?
return self.cal.minute_to_session_label(dt) in \
self.execution_periods
val = self.cal.minute_to_session_label(dt, direction="none").value
return val in self.execution_period_values
@lazyval
def execution_period_values(self):
# calculate the list of periods that match the given criteria
sessions = self.cal.all_sessions
return set(
pd.Series(data=sessions)
.groupby([sessions.year, sessions.weekofyear])
.nth(self.td_delta)
.astype(np.int64)
)
class NthTradingDayOfWeek(TradingDayOfWeekRule):
@@ -438,6 +473,8 @@ class NDaysBeforeLastTradingDayOfWeek(TradingDayOfWeekRule):
class TradingDayOfMonthRule(six.with_metaclass(ABCMeta, StatelessRule)):
@preprocess(n=lossless_float_to_int('TradingDayOfMonthRule'))
def __init__(self, n, invert):
if not 0 <= n < MAX_MONTH_RANGE:
raise _out_of_range_error(MAX_MONTH_RANGE)
@@ -448,15 +485,19 @@ class TradingDayOfMonthRule(six.with_metaclass(ABCMeta, StatelessRule)):
def should_trigger(self, dt):
# is this market minute's period in the list of execution periods?
return self.cal.minute_to_session_label(dt) in \
self.execution_periods
value = self.cal.minute_to_session_label(dt, direction="none").value
return value in self.execution_period_values
@lazyval
def execution_periods(self):
def execution_period_values(self):
# calculate the list of periods that match the given criteria
return self.cal.schedule.groupby(
pd.Grouper(freq="M")
).nth(int(self.td_delta)).index
sessions = self.cal.all_sessions
return set(
pd.Series(data=sessions)
.groupby([sessions.year, sessions.month])
.nth(self.td_delta)
.astype(np.int64)
)
class NthTradingDayOfMonth(TradingDayOfMonthRule):
+32 -8
View File
@@ -1,6 +1,7 @@
"""
Utilities for working with numpy arrays.
"""
from collections import OrderedDict
from datetime import datetime
from warnings import (
catch_warnings,
@@ -16,6 +17,7 @@ from numpy import (
empty,
flatnonzero,
hstack,
isnan,
nan,
vectorize,
where
@@ -60,12 +62,12 @@ _FILLVALUE_DEFAULTS = {
object_dtype: None,
}
INT_DTYPES_BY_SIZE_BYTES = {
1: dtype('int8'),
2: dtype('int16'),
4: dtype('int32'),
8: dtype('int64'),
}
INT_DTYPES_BY_SIZE_BYTES = OrderedDict([
(1, dtype('int8')),
(2, dtype('int16')),
(4, dtype('int32')),
(8, dtype('int64')),
])
def int_dtype_with_size_in_bytes(size):
@@ -285,6 +287,28 @@ def rolling_window(array, length):
# Sentinel value that isn't NaT.
_notNaT = make_datetime64D(0)
iNaT = NaTns.view(int64_dtype)
assert iNaT == NaTD.view(int64_dtype), "iNaTns != iNaTD"
def isnat(obj):
    """
    Check if a value is np.NaT.

    Parameters
    ----------
    obj : np.ndarray or numpy scalar
        Value(s) whose ``dtype.kind`` is 'M' (datetime64) or
        'm' (timedelta64).

    Returns
    -------
    The result of comparing the int64 view of ``obj`` with ``iNaT``.

    Raises
    ------
    ValueError
        If ``obj`` does not have a datetime64 or timedelta64 dtype.
    """
    if obj.dtype.kind not in ('m', 'M'):
        # Wrap in a 1-tuple so the offending value is always interpolated as
        # a single item.
        raise ValueError(
            "%s is not a numpy datetime or timedelta" % (obj,)
        )
    return obj.view(int64_dtype) == iNaT
def is_missing(data, missing_value):
    """
    Return a mask-like result marking where ``data`` holds ``missing_value``.

    NaN and NaT sentinels are detected with ``isnan`` / ``isnat`` rather
    than with ``==``; any other sentinel uses a plain equality comparison.
    """
    float_with_nan_sentinel = is_float(data) and isnan(missing_value)
    if float_with_nan_sentinel:
        return isnan(data)

    if is_datetime(data) and isnat(missing_value):
        return isnat(data)

    return data == missing_value
def busday_count_mask_NaT(begindates, enddates, out=None):
@@ -302,8 +326,8 @@ def busday_count_mask_NaT(begindates, enddates, out=None):
if out is None:
out = empty(broadcast(begindates, enddates).shape, dtype=float)
beginmask = (begindates == NaTD)
endmask = (enddates == NaTD)
beginmask = isnat(begindates)
endmask = isnat(enddates)
out = busday_count(
# Temporarily fill in non-NaT values.
+33 -13
View File
@@ -1,8 +1,10 @@
"""
Utilities for working with pandas objects.
"""
from contextlib import contextmanager
from itertools import product
import operator as op
import warnings
import pandas as pd
from distutils.version import StrictVersion
@@ -10,6 +12,10 @@ from distutils.version import StrictVersion
pandas_version = StrictVersion(pd.__version__)
def july_5th_holiday_observance(datetime_index):
    """
    Drop every entry of ``datetime_index`` whose year is 2013.
    """
    not_2013 = datetime_index.year != 2013
    return datetime_index[not_2013]
def explode(df):
"""
Take a DataFrame and return a triple of
@@ -19,19 +25,6 @@ def explode(df):
return df.index, df.columns, df.values
try:
# This branch is hit in pandas 17
sort_values = pd.DataFrame.sort_values
except AttributeError:
# This branch is hit in pandas 16
sort_values = pd.DataFrame.sort
if pandas_version >= StrictVersion('0.17.1'):
july_5th_holiday_observance = lambda dtix: dtix[dtix.year != 2013]
else:
july_5th_holiday_observance = lambda dt: None if dt.year == 2013 else dt
def _time_to_micros(time):
"""Convert a time into microseconds since midnight.
Parameters
@@ -146,3 +139,30 @@ def nearest_unequal_elements(dts, dt):
upper_value = dts[upper_ix] if upper_ix < len(dts) else None
return lower_value, upper_value
def timedelta_to_integral_seconds(delta):
    """
    Return the length of ``delta`` in whole seconds as an ``int``.

    The fractional part of ``delta.total_seconds()`` is discarded by
    ``int()``, i.e. the result is truncated toward zero.
    """
    seconds = delta.total_seconds()
    return int(seconds)
def timedelta_to_integral_minutes(delta):
    """
    Return the length of ``delta`` in whole minutes as an ``int``.

    Computed by floor-dividing the integral second count by 60.
    """
    whole_seconds = timedelta_to_integral_seconds(delta)
    return whole_seconds // 60
@contextmanager
def ignore_pandas_nan_categorical_warning():
    """
    Context manager that ignores every ``FutureWarning`` raised in its body.

    The previous warning filters are restored when the block exits.
    """
    # Pandas >= 0.18 doesn't like null-ish values in categories, but
    # avoiding that requires a broader change to how missing values are
    # handled in pipeline, so for now just silence the warning.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=FutureWarning)
        yield