From 2f8768bb066a912ad052e552ee67151291cbd25c Mon Sep 17 00:00:00 2001
From: fredfortier <fredfortier@gmail.com>
Date: Thu, 21 Sep 2017 16:36:10 -0400
Subject: [PATCH] Merged Victor's hack for the minute writer precision

---
 catalyst/data/minute_bars.py | 106 +++++++++++++++++------------------
 1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/catalyst/data/minute_bars.py b/catalyst/data/minute_bars.py
index d2707122..9cae04af 100644
--- a/catalyst/data/minute_bars.py
+++ b/catalyst/data/minute_bars.py
@@ -39,7 +39,7 @@ from catalyst.data._minute_bar_internal import (
 from catalyst.gens.sim_engine import NANOS_IN_MINUTE
 
 from catalyst.data.bar_reader import BarReader, NoDataOnDate
-from catalyst.data.us_equity_pricing import check_uint32_safe
+from catalyst.data.us_equity_pricing import check_uint64_safe
 from catalyst.utils.calendars import get_calendar
 from catalyst.utils.cli import maybe_show_progress
 from catalyst.utils.memoize import lazyval
@@ -52,7 +52,7 @@ FUTURES_MINUTES_PER_DAY = 1440
 
 DEFAULT_EXPECTEDLEN = US_EQUITIES_MINUTES_PER_DAY * 252 * 15
 
-OHLC_RATIO = 1000
+OHLC_RATIO = 100000000
 
 
 class BcolzMinuteOverlappingData(Exception):
@@ -114,15 +114,15 @@ def _sid_subdir_path(sid):
 
 
 def convert_cols(cols, scale_factor, sid, invalid_data_behavior):
-    """Adapt OHLCV columns into uint32 columns.
+    """Adapt OHLCV columns into uint64 columns.
 
     Parameters
     ----------
     cols : dict
         A dict mapping each column name (open, high, low, close, volume)
-        to a float column to convert to uint32.
+        to a float column to convert to uint64.
     scale_factor : int
-        Factor to use to scale float values before converting to uint32.
+        Factor to use to scale float values before converting to uint64.
     sid : int
         Sid of the relevant asset, for logging.
     invalid_data_behavior : str
@@ -135,6 +135,7 @@ def convert_cols(cols, scale_factor, sid, invalid_data_behavior):
     scaled_highs = np.nan_to_num(cols['high']) * scale_factor
     scaled_lows = np.nan_to_num(cols['low']) * scale_factor
     scaled_closes = np.nan_to_num(cols['close']) * scale_factor
+    scaled_volumes = np.nan_to_num(cols['volume']) * scale_factor
 
     exclude_mask = np.zeros_like(scaled_opens, dtype=bool)
 
@@ -143,11 +144,12 @@ def convert_cols(cols, scale_factor, sid, invalid_data_behavior):
         ('high', scaled_highs),
         ('low', scaled_lows),
         ('close', scaled_closes),
+        ('volume', scaled_volumes),
     ]:
         max_val = scaled_col.max()
 
         try:
-            check_uint32_safe(max_val, col_name)
+            check_uint64_safe(max_val, col_name)
         except ValueError:
             if invalid_data_behavior == 'raise':
                 raise
@@ -155,20 +157,20 @@ def convert_cols(cols, scale_factor, sid, invalid_data_behavior):
             if invalid_data_behavior == 'warn':
                 logger.warn(
                     'Values for sid={}, col={} contain some too large for '
-                    'uint32 (max={}), filtering them out',
+                    'uint64 (max={}), filtering them out',
                     sid, col_name, max_val,
                 )
 
             # We want to exclude all rows that have an unsafe value in
             # this column.
-            exclude_mask &= (scaled_col >= np.iinfo(np.uint32).max)
+            exclude_mask &= (scaled_col >= np.iinfo(np.uint64).max)
 
     # Convert all cols to uint32.
-    opens = scaled_opens.astype(np.uint32)
-    highs = scaled_highs.astype(np.uint32)
-    lows = scaled_lows.astype(np.uint32)
-    closes = scaled_closes.astype(np.uint32)
-    volumes = cols['volume'].astype(np.uint32)
+    opens = scaled_opens.astype(np.uint64)
+    highs = scaled_highs.astype(np.uint64)
+    lows = scaled_lows.astype(np.uint64)
+    closes = scaled_closes.astype(np.uint64)
+    volumes = scaled_volumes.astype(np.uint64)
 
     # Exclude rows with unsafe values by setting to zero.
     opens[exclude_mask] = 0
@@ -260,14 +262,14 @@ class BcolzMinuteBarMetadata(object):
             )
 
     def __init__(
-        self,
-        default_ohlc_ratio,
-        ohlc_ratios_per_sid,
-        calendar,
-        start_session,
-        end_session,
-        minutes_per_day,
-        version=FORMAT_VERSION,
+            self,
+            default_ohlc_ratio,
+            ohlc_ratios_per_sid,
+            calendar,
+            start_session,
+            end_session,
+            minutes_per_day,
+            version=FORMAT_VERSION,
     ):
         self.calendar = calendar
         self.start_session = start_session
@@ -288,7 +290,7 @@ class BcolzMinuteBarMetadata(object):
         ohlc_ratio : int
             The default ratio by which to multiply the pricing data to
             convert the floats from floats to an integer to fit within
-            the np.uint32. If ohlc_ratios_per_sid is None or does not
+            the np.uint64. If ohlc_ratios_per_sid is None or does not
             contain a mapping for a given sid, this ratio is used.
         ohlc_ratios_per_sid : dict
              A dict mapping each sid in the output to the factor by
@@ -340,10 +342,10 @@ class BcolzMinuteBarMetadata(object):
             'first_trading_day': str(self.start_session.date()),
             'market_opens': (
                 market_opens.values.astype('datetime64[m]').
-                astype(np.int64).tolist()),
+                    astype(np.int64).tolist()),
             'market_closes': (
                 market_closes.values.astype('datetime64[m]').
-                astype(np.int64).tolist()),
+                    astype(np.int64).tolist()),
         }
         with open(self.metadata_path(rootdir), 'w+') as fp:
             json.dump(metadata, fp)
@@ -372,13 +374,13 @@ class BcolzMinuteBarWriter(object):
         The last trading session in the data set.
     default_ohlc_ratio : int, optional
         The default ratio by which to multiply the pricing data to
-        convert from floats to integers that fit within np.uint32. If
+        convert from floats to integers that fit within np.uint64. If
         ohlc_ratios_per_sid is None or does not contain a mapping for a
-        given sid, this ratio is used. Default is OHLC_RATIO (1000).
+        given sid, this ratio is used. Default is OHLC_RATIO (10^8).
     ohlc_ratios_per_sid : dict, optional
         A dict mapping each sid in the output to the ratio by which to
         multiply the pricing data to convert the floats from floats to
-        an integer to fit within the np.uint32.
+        an integer to fit within the np.uint64.
     expectedlen : int, optional
         The expected length of the dataset, used when creating the initial
         bcolz ctable.
@@ -401,11 +403,9 @@ class BcolzMinuteBarWriter(object):
     Each individual asset's data is stored as a bcolz table with a column for
     each pricing field: (open, high, low, close, volume)
 
-    The open, high, low, and close columns are integers which are 1000 times
+    The open, high, low, close and volume columns are integers which are 10^8 times
     the quoted price, so that the data can represented and stored as an
-    np.uint32, supporting market prices quoted up to the thousands place.
-
-    volume is a np.uint32 with no mutation of the tens place.
+    np.uint64, supporting market prices quoted up to the 1/10^8-th place.
 
     The 'index' for each individual asset are a repeating period of minutes of
     length `minutes_per_day` starting from each market open.
@@ -573,7 +573,7 @@ class BcolzMinuteBarWriter(object):
         if not os.path.exists(sid_containing_dirname):
             # Other sids may have already created the containing directory.
             os.makedirs(sid_containing_dirname)
-        initial_array = np.empty(0, np.uint32)
+        initial_array = np.empty(0, np.uint64)
         table = ctable(
             rootdir=path,
             columns=[
@@ -610,7 +610,7 @@ class BcolzMinuteBarWriter(object):
         minute_offset = len(table) % self._minutes_per_day
         num_to_prepend = numdays * self._minutes_per_day - minute_offset
 
-        prepend_array = np.zeros(num_to_prepend, np.uint32)
+        prepend_array = np.zeros(num_to_prepend, np.uint64)
         # Fill all OHLCV with zeros.
         table.append([prepend_array] * 5)
         table.flush()
@@ -815,11 +815,11 @@ class BcolzMinuteBarWriter(object):
 
         minutes_count = all_minutes_in_window.size
 
-        open_col = np.zeros(minutes_count, dtype=np.uint32)
-        high_col = np.zeros(minutes_count, dtype=np.uint32)
-        low_col = np.zeros(minutes_count, dtype=np.uint32)
-        close_col = np.zeros(minutes_count, dtype=np.uint32)
-        vol_col = np.zeros(minutes_count, dtype=np.uint32)
+        open_col = np.zeros(minutes_count, dtype=np.uint64)
+        high_col = np.zeros(minutes_count, dtype=np.uint64)
+        low_col = np.zeros(minutes_count, dtype=np.uint64)
+        close_col = np.zeros(minutes_count, dtype=np.uint64)
+        vol_col = np.zeros(minutes_count, dtype=np.uint64)
 
         dt_ixs = np.searchsorted(all_minutes_in_window.values,
                                  dts.astype('datetime64[ns]'))
@@ -914,10 +914,10 @@ class BcolzMinuteBarReader(MinuteBarReader):
         )
         self._schedule = self.calendar.schedule[slicer]
         self._market_opens = self._schedule.market_open
-        self._market_open_values = self._market_opens.values.\
+        self._market_open_values = self._market_opens.values. \
             astype('datetime64[m]').astype(np.int64)
         self._market_closes = self._schedule.market_close
-        self._market_close_values = self._market_closes.values.\
+        self._market_close_values = self._market_closes.values. \
             astype('datetime64[m]').astype(np.int64)
 
         self._default_ohlc_inverse = 1.0 / metadata.default_ohlc_ratio
@@ -1125,8 +1125,8 @@ class BcolzMinuteBarReader(MinuteBarReader):
             else:
                 return np.nan
 
-        if field != 'volume':
-            value *= self._ohlc_ratio_inverse_for_sid(sid)
+        #if field != 'volume':
+        value *= self._ohlc_ratio_inverse_for_sid(sid)
         return value
 
     def get_last_traded_dt(self, asset, dt):
@@ -1206,7 +1206,7 @@ class BcolzMinuteBarReader(MinuteBarReader):
             minute_dt.value / NANOS_IN_MINUTE,
             self._minutes_per_day,
             False,
-        )
+            )
 
     def load_raw_arrays(self, fields, start_dt, end_dt, sids):
         """
@@ -1248,7 +1248,7 @@ class BcolzMinuteBarReader(MinuteBarReader):
             if field != 'volume':
                 out = np.full(shape, np.nan)
             else:
-                out = np.zeros(shape, dtype=np.uint32)
+                out = np.zeros(shape, dtype=np.uint64)
 
             for i, sid in enumerate(sids):
                 carray = self._open_minute_file(field, sid)
@@ -1256,17 +1256,17 @@ class BcolzMinuteBarReader(MinuteBarReader):
                 if indices_to_exclude is not None:
                     for excl_start, excl_stop in indices_to_exclude[::-1]:
                         excl_slice = np.s_[
-                            excl_start - start_idx:excl_stop - start_idx + 1]
+                                     excl_start - start_idx:excl_stop - start_idx + 1]
                         values = np.delete(values, excl_slice)
 
                 where = values != 0
                 # first slice down to len(where) because we might not have
                 # written data for all the minutes requested
-                if field != 'volume':
-                    out[:len(where), i][where] = (
-                        values[where] * self._ohlc_ratio_inverse_for_sid(sid))
-                else:
-                    out[:len(where), i][where] = values[where]
+                #if field != 'volume':
+                out[:len(where), i][where] = (
+                    values[where] * self._ohlc_ratio_inverse_for_sid(sid))
+                #else:
+                #    out[:len(where), i][where] = values[where]
 
             results.append(out)
         return results
@@ -1319,9 +1319,9 @@ class H5MinuteBarUpdateWriter(object):
 
     def __init__(self, path, complevel=None, complib=None):
         self._complevel = complevel if complevel \
-            is not None else self._COMPLEVEL
+                                       is not None else self._COMPLEVEL
         self._complib = complib if complib \
-            is not None else self._COMPLIB
+                                   is not None else self._COMPLIB
         self._path = path
 
     def write(self, frames):
@@ -1358,4 +1358,4 @@ class H5MinuteBarUpdateReader(MinuteBarUpdateReader):
 
     def read(self, dts, sids):
         panel = self._panel[sids, dts, :]
-        return panel.iteritems()
+        return panel.iteritems()
\ No newline at end of file