From c0d0bc33d22ef4a940d6cfaec07fdb7cb19f2b10 Mon Sep 17 00:00:00 2001
From: Conner Fromknecht <conner@mit.edu>
Date: Thu, 6 Jul 2017 15:01:08 -0700
Subject: [PATCH] Converted OHLCV columns to use uint64. Default tick size now
 1/1000000th of a coin

---
 catalyst/data/_equities.pyx        | 14 +++++++++-----
 catalyst/data/us_equity_pricing.py | 31 +++++++++++++++++-------------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/catalyst/data/_equities.pyx b/catalyst/data/_equities.pyx
index 5b80aada..81f9a66e 100644
--- a/catalyst/data/_equities.pyx
+++ b/catalyst/data/_equities.pyx
@@ -21,6 +21,7 @@ from numpy import (
     float64,
     intp,
     uint32,
+    uint64,
     zeros,
 )
 from numpy cimport (
@@ -28,6 +29,7 @@ from numpy cimport (
     intp_t,
     ndarray,
     uint32_t,
+    uint64_t,
     uint8_t,
 )
 from numpy.math cimport NAN
@@ -167,8 +169,8 @@ cpdef _read_bcolz_data(ctable_t table,
         int nassets
         str column_name
         carray_t carray
-        ndarray[dtype=uint32_t, ndim=1] raw_data
-        ndarray[dtype=uint32_t, ndim=2] outbuf
+        ndarray[dtype=uint64_t, ndim=1] raw_data
+        ndarray[dtype=uint64_t, ndim=2] outbuf
         ndarray[dtype=uint8_t, ndim=2, cast=True] where_nan
         ndarray[dtype=float64_t, ndim=2] outbuf_as_float
         intp_t asset
@@ -185,7 +187,7 @@ cpdef _read_bcolz_data(ctable_t table,
         raise ValueError("Incompatible index arrays.")
 
     for column_name in columns:
-        outbuf = zeros(shape=shape, dtype=uint32)
+        outbuf = zeros(shape=shape, dtype=uint64)
         if read_all:
             raw_data = table[column_name][:]
 
@@ -213,11 +215,13 @@ cpdef _read_bcolz_data(ctable_t table,
                 else:
                     continue
 
-        if column_name in {'open', 'high', 'low', 'close'}:
+        if column_name in ['open', 'high', 'low', 'close']:
             where_nan = (outbuf == 0)
-            outbuf_as_float = outbuf.astype(float64) * .001
+            outbuf_as_float = outbuf.astype(float64) * .000001
             outbuf_as_float[where_nan] = NAN
             results.append(outbuf_as_float)
+        elif column_name != 'volume':
+            results.append(outbuf.astype(uint32))
         else:
             results.append(outbuf)
     return results
diff --git a/catalyst/data/us_equity_pricing.py b/catalyst/data/us_equity_pricing.py
index 73281719..f20ac003 100644
--- a/catalyst/data/us_equity_pricing.py
+++ b/catalyst/data/us_equity_pricing.py
@@ -34,6 +34,7 @@ from numpy import (
     issubdtype,
     nan,
     uint32,
+    uint64,
 )
 from pandas import (
     DataFrame,
@@ -80,6 +81,7 @@ from ._adjustments import load_adjustments_from_sqlite
 logger = logbook.Logger('UsEquityPricing')
 
 OHLC = frozenset(['open', 'high', 'low', 'close'])
+OHLCV = frozenset(['open', 'high', 'low', 'close', 'volume'])
 US_EQUITY_PRICING_BCOLZ_COLUMNS = (
     'open', 'high', 'low', 'close', 'volume', 'day', 'id'
 )
@@ -109,6 +111,7 @@ SQLITE_STOCK_DIVIDEND_PAYOUT_COLUMN_DTYPES = {
     'ratio': float,
 }
 UINT32_MAX = iinfo(uint32).max
+UINT64_MAX = iinfo(uint64).max
 
 
 def check_uint32_safe(value, colname):
@@ -119,25 +122,25 @@ def check_uint32_safe(value, colname):
 
 
 @expect_element(invalid_data_behavior={'warn', 'raise', 'ignore'})
-def winsorise_uint32(df, invalid_data_behavior, column, *columns):
-    """Drops any record where a value would not fit into a uint32.
+def winsorise_uint64(df, invalid_data_behavior, column, *columns):
+    """Drops any record where a value would not fit into a uint64.
 
     Parameters
     ----------
     df : pd.DataFrame
         The dataframe to winsorise.
     invalid_data_behavior : {'warn', 'raise', 'ignore'}
-        What to do when data is outside the bounds of a uint32.
+        What to do when data is outside the bounds of a uint64.
     *columns : iterable[str]
         The names of the columns to check.
 
     Returns
     -------
     truncated : pd.DataFrame
-        ``df`` with values that do not fit into a uint32 zeroed out.
+        ``df`` with values that do not fit into a uint64 zeroed out.
     """
     columns = list((column,) + columns)
-    mask = df[columns] > UINT32_MAX
+    mask = df[columns] > UINT64_MAX
 
     if invalid_data_behavior != 'ignore':
         mask |= df[columns].isnull()
@@ -150,14 +153,14 @@ def winsorise_uint32(df, invalid_data_behavior, column, *columns):
     if mv.any():
         if invalid_data_behavior == 'raise':
             raise ValueError(
-                '%d values out of bounds for uint32: %r' % (
+                '%d values out of bounds for uint64: %r' % (
                     mv.sum(), df[mask.any(axis=1)],
                 ),
             )
         if invalid_data_behavior == 'warn':
             warnings.warn(
                 'Ignoring %d values because they are out of bounds for'
-                ' uint32: %r' % (
+                ' uint64: %r' % (
                     mv.sum(), df[mask.any(axis=1)],
                 ),
                 stacklevel=3,  # one extra frame for `expect_element`
@@ -239,7 +242,7 @@ class BcolzDailyBarWriter(object):
             Whether or not to show a progress bar while writing.
         invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional
             What to do when data is encountered that is outside the range of
-            a uint32.
+            a uint64.
 
         Returns
         -------
@@ -274,7 +277,7 @@ class BcolzDailyBarWriter(object):
             Whether or not to show a progress bar while writing.
         invalid_data_behavior : {'warn', 'raise', 'ignore'}
             What to do when data is encountered that is outside the range of
-            a uint32.
+            a uint64.
         """
         read = partial(
             read_csv,
@@ -302,7 +305,9 @@ class BcolzDailyBarWriter(object):
 
         # Maps column name -> output carray.
         columns = {
-            k: carray(array([], dtype=uint32))
+            k: carray(array([], dtype=uint64))
+               if k in OHLCV
+               else carray(array([], dtype=uint32))
             for k in US_EQUITY_PRICING_BCOLZ_COLUMNS
         }
 
@@ -417,12 +422,12 @@ class BcolzDailyBarWriter(object):
             # we already have a ctable so do nothing
             return raw_data
 
-        winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)
-        processed = (raw_data[list(OHLC)] * 1000).astype('uint32')
+        winsorise_uint64(raw_data, invalid_data_behavior, 'volume', *OHLC)
+        processed = (raw_data[list(OHLC)] * 1000000).astype('uint64')
         dates = raw_data.index.values.astype('datetime64[s]')
         check_uint32_safe(dates.max().view(np.int64), 'day')
         processed['day'] = dates.astype('uint32')
-        processed['volume'] = raw_data.volume.astype('uint32')
+        processed['volume'] = raw_data.volume.astype('uint64')
         return ctable.fromdataframe(processed)