From c0d0bc33d22ef4a940d6cfaec07fdb7cb19f2b10 Mon Sep 17 00:00:00 2001 From: Conner Fromknecht Date: Thu, 6 Jul 2017 15:01:08 -0700 Subject: [PATCH] Converted OHLCV columns to use uint64. Default tick size now 1/1000000th of a coin --- catalyst/data/_equities.pyx | 14 +++++++++----- catalyst/data/us_equity_pricing.py | 31 +++++++++++++++++------------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/catalyst/data/_equities.pyx b/catalyst/data/_equities.pyx index 5b80aada..81f9a66e 100644 --- a/catalyst/data/_equities.pyx +++ b/catalyst/data/_equities.pyx @@ -21,6 +21,7 @@ from numpy import ( float64, intp, uint32, + uint64, zeros, ) from numpy cimport ( @@ -28,6 +29,7 @@ from numpy cimport ( intp_t, ndarray, uint32_t, + uint64_t, uint8_t, ) from numpy.math cimport NAN @@ -167,8 +169,8 @@ cpdef _read_bcolz_data(ctable_t table, int nassets str column_name carray_t carray - ndarray[dtype=uint32_t, ndim=1] raw_data - ndarray[dtype=uint32_t, ndim=2] outbuf + ndarray[dtype=uint64_t, ndim=1] raw_data + ndarray[dtype=uint64_t, ndim=2] outbuf ndarray[dtype=uint8_t, ndim=2, cast=True] where_nan ndarray[dtype=float64_t, ndim=2] outbuf_as_float intp_t asset @@ -185,7 +187,7 @@ cpdef _read_bcolz_data(ctable_t table, raise ValueError("Incompatible index arrays.") for column_name in columns: - outbuf = zeros(shape=shape, dtype=uint32) + outbuf = zeros(shape=shape, dtype=uint64) if read_all: raw_data = table[column_name][:] @@ -213,11 +215,13 @@ cpdef _read_bcolz_data(ctable_t table, else: continue - if column_name in {'open', 'high', 'low', 'close'}: + if column_name in ['open', 'high', 'low', 'close']: where_nan = (outbuf == 0) - outbuf_as_float = outbuf.astype(float64) * .001 + outbuf_as_float = outbuf.astype(float64) * .000001 outbuf_as_float[where_nan] = NAN results.append(outbuf_as_float) + elif column_name != 'volume': + results.append(outbuf.astype(uint32)) else: results.append(outbuf) return results diff --git a/catalyst/data/us_equity_pricing.py 
b/catalyst/data/us_equity_pricing.py index 73281719..f20ac003 100644 --- a/catalyst/data/us_equity_pricing.py +++ b/catalyst/data/us_equity_pricing.py @@ -34,6 +34,7 @@ from numpy import ( issubdtype, nan, uint32, + uint64, ) from pandas import ( DataFrame, @@ -80,6 +81,7 @@ from ._adjustments import load_adjustments_from_sqlite logger = logbook.Logger('UsEquityPricing') OHLC = frozenset(['open', 'high', 'low', 'close']) +OHLCV = frozenset(['open', 'high', 'low', 'close', 'volume']) US_EQUITY_PRICING_BCOLZ_COLUMNS = ( 'open', 'high', 'low', 'close', 'volume', 'day', 'id' ) @@ -109,6 +111,7 @@ SQLITE_STOCK_DIVIDEND_PAYOUT_COLUMN_DTYPES = { 'ratio': float, } UINT32_MAX = iinfo(uint32).max +UINT64_MAX = iinfo(uint64).max def check_uint32_safe(value, colname): @@ -119,25 +122,25 @@ def check_uint32_safe(value, colname): @expect_element(invalid_data_behavior={'warn', 'raise', 'ignore'}) -def winsorise_uint32(df, invalid_data_behavior, column, *columns): - """Drops any record where a value would not fit into a uint32. +def winsorise_uint64(df, invalid_data_behavior, column, *columns): + """Drops any record where a value would not fit into a uint64. Parameters ---------- df : pd.DataFrame The dataframe to winsorise. invalid_data_behavior : {'warn', 'raise', 'ignore'} - What to do when data is outside the bounds of a uint32. + What to do when data is outside the bounds of a uint64. *columns : iterable[str] The names of the columns to check. Returns ------- truncated : pd.DataFrame - ``df`` with values that do not fit into a uint32 zeroed out. + ``df`` with values that do not fit into a uint64 zeroed out. 
""" columns = list((column,) + columns) - mask = df[columns] > UINT32_MAX + mask = df[columns] > UINT64_MAX if invalid_data_behavior != 'ignore': mask |= df[columns].isnull() @@ -150,14 +153,14 @@ def winsorise_uint32(df, invalid_data_behavior, column, *columns): if mv.any(): if invalid_data_behavior == 'raise': raise ValueError( - '%d values out of bounds for uint32: %r' % ( + '%d values out of bounds for uint64: %r' % ( mv.sum(), df[mask.any(axis=1)], ), ) if invalid_data_behavior == 'warn': warnings.warn( 'Ignoring %d values because they are out of bounds for' - ' uint32: %r' % ( + ' uint64: %r' % ( mv.sum(), df[mask.any(axis=1)], ), stacklevel=3, # one extra frame for `expect_element` @@ -239,7 +242,7 @@ class BcolzDailyBarWriter(object): Whether or not to show a progress bar while writing. invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional What to do when data is encountered that is outside the range of - a uint32. + a uint64. Returns ------- @@ -274,7 +277,7 @@ class BcolzDailyBarWriter(object): Whether or not to show a progress bar while writing. invalid_data_behavior : {'warn', 'raise', 'ignore'} What to do when data is encountered that is outside the range of - a uint32. + a uint64. """ read = partial( read_csv, @@ -302,7 +305,9 @@ class BcolzDailyBarWriter(object): # Maps column name -> output carray. 
columns = { - k: carray(array([], dtype=uint32)) + k: carray(array([], dtype=uint64)) + if k in OHLCV + else carray(array([], dtype=uint32)) for k in US_EQUITY_PRICING_BCOLZ_COLUMNS } @@ -417,12 +422,12 @@ class BcolzDailyBarWriter(object): # we already have a ctable so do nothing return raw_data - winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC) - processed = (raw_data[list(OHLC)] * 1000).astype('uint32') + winsorise_uint64(raw_data, invalid_data_behavior, 'volume', *OHLC) + processed = (raw_data[list(OHLC)] * 1000000).astype('uint64') dates = raw_data.index.values.astype('datetime64[s]') check_uint32_safe(dates.max().view(np.int64), 'day') processed['day'] = dates.astype('uint32') - processed['volume'] = raw_data.volume.astype('uint32') + processed['volume'] = raw_data.volume.astype('uint64') return ctable.fromdataframe(processed)