Converted OHLCV columns to use uint64. Default tick size is now 1/1,000,000th of a coin (prices are scaled by 1,000,000 on write and by .000001 on read; previously 1/1000th)

This commit is contained in:
Conner Fromknecht
2017-07-06 15:01:08 -07:00
parent d9762e8646
commit c0d0bc33d2
2 changed files with 27 additions and 18 deletions
+9 -5
View File
@@ -21,6 +21,7 @@ from numpy import (
float64,
intp,
uint32,
uint64,
zeros,
)
from numpy cimport (
@@ -28,6 +29,7 @@ from numpy cimport (
intp_t,
ndarray,
uint32_t,
uint64_t,
uint8_t,
)
from numpy.math cimport NAN
@@ -167,8 +169,8 @@ cpdef _read_bcolz_data(ctable_t table,
int nassets
str column_name
carray_t carray
ndarray[dtype=uint32_t, ndim=1] raw_data
ndarray[dtype=uint32_t, ndim=2] outbuf
ndarray[dtype=uint64_t, ndim=1] raw_data
ndarray[dtype=uint64_t, ndim=2] outbuf
ndarray[dtype=uint8_t, ndim=2, cast=True] where_nan
ndarray[dtype=float64_t, ndim=2] outbuf_as_float
intp_t asset
@@ -185,7 +187,7 @@ cpdef _read_bcolz_data(ctable_t table,
raise ValueError("Incompatible index arrays.")
for column_name in columns:
outbuf = zeros(shape=shape, dtype=uint32)
outbuf = zeros(shape=shape, dtype=uint64)
if read_all:
raw_data = table[column_name][:]
@@ -213,11 +215,13 @@ cpdef _read_bcolz_data(ctable_t table,
else:
continue
if column_name in {'open', 'high', 'low', 'close'}:
if column_name in ['open', 'high', 'low', 'close']:
where_nan = (outbuf == 0)
outbuf_as_float = outbuf.astype(float64) * .001
outbuf_as_float = outbuf.astype(float64) * .000001
outbuf_as_float[where_nan] = NAN
results.append(outbuf_as_float)
elif column_name != 'volume':
results.append(outbuf.astype(uint32))
else:
results.append(outbuf)
return results
+18 -13
View File
@@ -34,6 +34,7 @@ from numpy import (
issubdtype,
nan,
uint32,
uint64,
)
from pandas import (
DataFrame,
@@ -80,6 +81,7 @@ from ._adjustments import load_adjustments_from_sqlite
logger = logbook.Logger('UsEquityPricing')
OHLC = frozenset(['open', 'high', 'low', 'close'])
OHLCV = frozenset(['open', 'high', 'low', 'close', 'volume'])
US_EQUITY_PRICING_BCOLZ_COLUMNS = (
'open', 'high', 'low', 'close', 'volume', 'day', 'id'
)
@@ -109,6 +111,7 @@ SQLITE_STOCK_DIVIDEND_PAYOUT_COLUMN_DTYPES = {
'ratio': float,
}
UINT32_MAX = iinfo(uint32).max
UINT64_MAX = iinfo(uint64).max
def check_uint32_safe(value, colname):
@@ -119,25 +122,25 @@ def check_uint32_safe(value, colname):
@expect_element(invalid_data_behavior={'warn', 'raise', 'ignore'})
def winsorise_uint32(df, invalid_data_behavior, column, *columns):
"""Drops any record where a value would not fit into a uint32.
def winsorise_uint64(df, invalid_data_behavior, column, *columns):
"""Drops any record where a value would not fit into a uint64.
Parameters
----------
df : pd.DataFrame
The dataframe to winsorise.
invalid_data_behavior : {'warn', 'raise', 'ignore'}
What to do when data is outside the bounds of a uint32.
What to do when data is outside the bounds of a uint64.
*columns : iterable[str]
The names of the columns to check.
Returns
-------
truncated : pd.DataFrame
``df`` with values that do not fit into a uint32 zeroed out.
``df`` with values that do not fit into a uint64 zeroed out.
"""
columns = list((column,) + columns)
mask = df[columns] > UINT32_MAX
mask = df[columns] > UINT64_MAX
if invalid_data_behavior != 'ignore':
mask |= df[columns].isnull()
@@ -150,14 +153,14 @@ def winsorise_uint32(df, invalid_data_behavior, column, *columns):
if mv.any():
if invalid_data_behavior == 'raise':
raise ValueError(
'%d values out of bounds for uint32: %r' % (
'%d values out of bounds for uint64: %r' % (
mv.sum(), df[mask.any(axis=1)],
),
)
if invalid_data_behavior == 'warn':
warnings.warn(
'Ignoring %d values because they are out of bounds for'
' uint32: %r' % (
' uint64: %r' % (
mv.sum(), df[mask.any(axis=1)],
),
stacklevel=3, # one extra frame for `expect_element`
@@ -239,7 +242,7 @@ class BcolzDailyBarWriter(object):
Whether or not to show a progress bar while writing.
invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional
What to do when data is encountered that is outside the range of
a uint32.
a uint64.
Returns
-------
@@ -274,7 +277,7 @@ class BcolzDailyBarWriter(object):
Whether or not to show a progress bar while writing.
invalid_data_behavior : {'warn', 'raise', 'ignore'}
What to do when data is encountered that is outside the range of
a uint32.
a uint64.
"""
read = partial(
read_csv,
@@ -302,7 +305,9 @@ class BcolzDailyBarWriter(object):
# Maps column name -> output carray.
columns = {
k: carray(array([], dtype=uint32))
k: carray(array([], dtype=uint64))
if k in OHLCV
else carray(array([], dtype=uint32))
for k in US_EQUITY_PRICING_BCOLZ_COLUMNS
}
@@ -417,12 +422,12 @@ class BcolzDailyBarWriter(object):
# we already have a ctable so do nothing
return raw_data
winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)
processed = (raw_data[list(OHLC)] * 1000).astype('uint32')
winsorise_uint64(raw_data, invalid_data_behavior, 'volume', *OHLC)
processed = (raw_data[list(OHLC)] * 1000000).astype('uint64')
dates = raw_data.index.values.astype('datetime64[s]')
check_uint32_safe(dates.max().view(np.int64), 'day')
processed['day'] = dates.astype('uint32')
processed['volume'] = raw_data.volume.astype('uint32')
processed['volume'] = raw_data.volume.astype('uint64')
return ctable.fromdataframe(processed)