From be08a77d7684b91fec5bc0f358ef786631b4355f Mon Sep 17 00:00:00 2001 From: Eddie Hebert Date: Mon, 28 Mar 2016 15:26:28 -0400 Subject: [PATCH] BUG: Prevent writing int max instead of nan. np.ndarray.astype cannot be relied upon to convert NaNs to 0. Fix by calling nan_to_num on the float arrays before converting to uint32. --- tests/data/test_minute_bars.py | 93 +++++++++++++++++++++++++++++++++- zipline/data/minute_bars.py | 14 +++-- 2 files changed, 100 insertions(+), 7 deletions(-) diff --git a/tests/data/test_minute_bars.py b/tests/data/test_minute_bars.py index f1da7ace..443c8f8e 100644 --- a/tests/data/test_minute_bars.py +++ b/tests/data/test_minute_bars.py @@ -17,14 +17,23 @@ import os from unittest import TestCase -from numpy import nan, array -from numpy.testing import assert_almost_equal +from numpy import ( + arange, + array, + int64, + float64, + full, + nan, + zeros, +) +from numpy.testing import assert_almost_equal, assert_array_equal from pandas import ( DataFrame, DatetimeIndex, Timestamp, Timedelta, NaT, + date_range, ) from testfixtures import TempDirectory @@ -451,6 +460,86 @@ class BcolzMinuteBarTestCase(TestCase): self.assertEquals(100.0, volume_price) + def test_nans(self): + """ + Test writing empty data. 
+ """ + sid = 1 + last_date = self.writer.last_date_in_output_for_sid(sid) + self.assertIs(last_date, NaT) + + self.writer.pad(sid, TEST_CALENDAR_START) + + last_date = self.writer.last_date_in_output_for_sid(sid) + self.assertEqual(last_date, TEST_CALENDAR_START) + + freq = self.market_opens.index.freq + minute = self.market_opens[TEST_CALENDAR_START + freq] + minutes = date_range(minute, periods=9, freq='min') + data = DataFrame( + data={ + 'open': full(9, nan), + 'high': full(9, nan), + 'low': full(9, nan), + 'close': full(9, nan), + 'volume': full(9, 0), + }, + index=[minutes]) + self.writer.write(sid, data) + + fields = ['open', 'high', 'low', 'close', 'volume'] + + ohlcv_window = self.reader.unadjusted_window( + fields, minutes[0], minutes[-1], [sid]) + + for i, field in enumerate(fields): + if field != 'volume': + assert_array_equal(full(9, nan), ohlcv_window[i][0]) + else: + assert_array_equal(zeros(9), ohlcv_window[i][0]) + + def test_differing_nans(self): + """ + Also test nans of differing values/construction. + """ + sid = 1 + last_date = self.writer.last_date_in_output_for_sid(sid) + self.assertIs(last_date, NaT) + + self.writer.pad(sid, TEST_CALENDAR_START) + + last_date = self.writer.last_date_in_output_for_sid(sid) + self.assertEqual(last_date, TEST_CALENDAR_START) + + freq = self.market_opens.index.freq + minute = self.market_opens[TEST_CALENDAR_START + freq] + minutes = date_range(minute, periods=9, freq='min') + data = DataFrame( + data={ + 'open': ((0b11111111111 << 52) + arange(1, 10, dtype=int64)). + view(float64), + 'high': ((0b11111111111 << 52) + arange(11, 20, dtype=int64)). + view(float64), + 'low': ((0b11111111111 << 52) + arange(21, 30, dtype=int64)). + view(float64), + 'close': ((0b11111111111 << 52) + arange(31, 40, dtype=int64)). 
+ view(float64), + 'volume': full(9, 0), + }, + index=[minutes]) + self.writer.write(sid, data) + + fields = ['open', 'high', 'low', 'close', 'volume'] + + ohlcv_window = self.reader.unadjusted_window( + fields, minutes[0], minutes[-1], [sid]) + + for i, field in enumerate(fields): + if field != 'volume': + assert_array_equal(full(9, nan), ohlcv_window[i][0]) + else: + assert_array_equal(zeros(9), ohlcv_window[i][0]) + def test_write_cols(self): minute_0 = self.market_opens[self.test_calendar_start] minute_1 = minute_0 + timedelta(minutes=1) diff --git a/zipline/data/minute_bars.py b/zipline/data/minute_bars.py index 1885d520..d3cede5d 100644 --- a/zipline/data/minute_bars.py +++ b/zipline/data/minute_bars.py @@ -16,6 +16,7 @@ from textwrap import dedent import bcolz from bcolz import ctable import numpy as np +from numpy import nan_to_num from os.path import join import json import os @@ -526,11 +527,14 @@ class BcolzMinuteBarWriter(object): dts.astype('datetime64[ns]')) ohlc_ratio = self._ohlc_ratio - open_col[dt_ixs] = (cols['open'] * ohlc_ratio).astype(np.uint32) - high_col[dt_ixs] = (cols['high'] * ohlc_ratio).astype(np.uint32) - low_col[dt_ixs] = (cols['low'] * ohlc_ratio).astype(np.uint32) - close_col[dt_ixs] = (cols['close'] * ohlc_ratio).astype( - np.uint32) + open_col[dt_ixs] = (nan_to_num(cols['open']) * ohlc_ratio).\ + astype(np.uint32) + high_col[dt_ixs] = (nan_to_num(cols['high']) * ohlc_ratio).\ + astype(np.uint32) + low_col[dt_ixs] = (nan_to_num(cols['low']) * ohlc_ratio).\ + astype(np.uint32) + close_col[dt_ixs] = (nan_to_num(cols['close']) * ohlc_ratio).\ + astype(np.uint32) vol_col[dt_ixs] = cols['volume'].astype(np.uint32) table.append([