From e5039a43b09bbcdf6bfd20e6f54abd623312ffd1 Mon Sep 17 00:00:00 2001
From: Maya Tydykov <mtydykov@quantopian.com>
Date: Wed, 18 May 2016 15:48:04 -0400
Subject: [PATCH] TST: add tests to ensure no forward filling of non-missing
 values

STY: fix indentation

DOC: add docs to clarify test input/output
---
 tests/pipeline/test_blaze.py           | 209 ++++++++++++++++++++++---
 zipline/pipeline/loaders/blaze/core.py |  32 ++--
 2 files changed, 212 insertions(+), 29 deletions(-)

diff --git a/tests/pipeline/test_blaze.py b/tests/pipeline/test_blaze.py
index 512be3ee..742a6b70 100644
--- a/tests/pipeline/test_blaze.py
+++ b/tests/pipeline/test_blaze.py
@@ -270,19 +270,190 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
             NonPipelineField,
         )
 
-    def test_cols_with_missing_vals(self):
-        dates = (self.dates[0], self.dates[-1])
+    def test_cols_with_all_missing_vals(self):
+        """
+        Tests that when there is no known data, we get output where the
+        columns have the right dtypes and the right missing values filled in.
+
+        input (df):
+        Empty DataFrame
+        Columns: [sid, float_value, str_value, int_value, bool_value, dt_value,
+            asof_date, timestamp]
+        Index: []
+
+        output (expected)
+                                          str_value  float_value  int_value
+        2014-01-01 Equity(65 [A])      None          NaN          0
+                   Equity(66 [B])      None          NaN          0
+                   Equity(67 [C])      None          NaN          0
+        2014-01-02 Equity(65 [A])      None          NaN          0
+                   Equity(66 [B])      None          NaN          0
+                   Equity(67 [C])      None          NaN          0
+        2014-01-03 Equity(65 [A])      None          NaN          0
+                   Equity(66 [B])      None          NaN          0
+                   Equity(67 [C])      None          NaN          0
+
+                                  dt_value  bool_value
+        2014-01-01 Equity(65 [A])      NaT  False
+                   Equity(66 [B])      NaT  False
+                   Equity(67 [C])      NaT  False
+        2014-01-02 Equity(65 [A])      NaT  False
+                   Equity(66 [B])      NaT  False
+                   Equity(67 [C])      NaT  False
+        2014-01-03 Equity(65 [A])      NaT  False
+                   Equity(66 [B])      NaT  False
+                   Equity(67 [C])      NaT  False
+        """
+        df = pd.DataFrame(columns=['sid', 'float_value', 'str_value',
+                                   'int_value', 'bool_value', 'dt_value',
+                                   'asof_date', 'timestamp'])
+
+        expr = bz.data(
+            df,
+            dshape="""
+            var * {
+                 sid: int64,
+                 float_value: float64,
+                 str_value: string,
+                 int_value: int64,
+                 bool_value: bool,
+                 dt_value: datetime,
+                 asof_date: datetime,
+                 timestamp: datetime,
+            }""",
+        )
+        fields = OrderedDict(expr.dshape.measure.fields)
+
+        expected = pd.DataFrame({
+            "str_value": np.array([None,
+                                   None,
+                                   None,
+                                   None,
+                                   None,
+                                   None,
+                                   None,
+                                   None,
+                                   None],
+                                  dtype='object'),
+            "float_value": np.array([np.NaN,
+                                     np.NaN,
+                                     np.NaN,
+                                     np.NaN,
+                                     np.NaN,
+                                     np.NaN,
+                                     np.NaN,
+                                     np.NaN,
+                                     np.NaN],
+                                    dtype='float64'),
+            "int_value": np.array([0,
+                                   0,
+                                   0,
+                                   0,
+                                   0,
+                                   0,
+                                   0,
+                                   0,
+                                   0],
+                                  dtype='int64'),
+            "bool_value": np.array([False,
+                                    False,
+                                    False,
+                                    False,
+                                    False,
+                                    False,
+                                    False,
+                                    False,
+                                    False],
+                                   dtype='bool'),
+            "dt_value": [pd.NaT,
+                         pd.NaT,
+                         pd.NaT,
+                         pd.NaT,
+                         pd.NaT,
+                         pd.NaT,
+                         pd.NaT,
+                         pd.NaT,
+                         pd.NaT],
+        },
+            columns=['str_value', 'float_value', 'int_value', 'bool_value',
+                     'dt_value'],
+            index=pd.MultiIndex.from_product(
+                (self.dates, self.asset_finder.retrieve_all(
+                    self.ASSET_FINDER_EQUITY_SIDS
+                ))
+            )
+        )
+
+        self._test_id(
+            df,
+            var * Record(fields),
+            expected,
+            self.asset_finder,
+            ('float_value', 'str_value', 'int_value', 'bool_value',
+             'dt_value'),
+        )
+
+    def test_cols_with_some_missing_vals(self):
+        """
+        Tests the following:
+            1) Forward filling replaces missing values correctly for the data
+            types supported in pipeline.
+            2) An int/bool value equal to the missing value (0/False) that was
+             actually observed for a date is kept, not forward filled over.
+            3) We get the correct type of missing value in the output.
+
+        input (df):
+           asof_date bool_value   dt_value  float_value  int_value  sid
+        0 2014-01-01       True 2011-01-01            0          1   65
+        1 2014-01-03       True 2011-01-02            1          2   66
+        2 2014-01-01       True 2011-01-03            2          3   67
+        3 2014-01-02      False        NaT          NaN          0   67
+
+          str_value  timestamp
+        0         a  2014-01-01
+        1         b  2014-01-03
+        2         c  2014-01-01
+        3      None  2014-01-02
+
+        output (expected)
+                                  str_value  float_value  int_value bool_value
+        2014-01-01 Equity(65 [A])         a            0          1       True
+                   Equity(66 [B])      None          NaN          0      False
+                   Equity(67 [C])         c            2          3       True
+        2014-01-02 Equity(65 [A])         a            0          1       True
+                   Equity(66 [B])      None          NaN          0      False
+                   Equity(67 [C])         c            2          0      False
+        2014-01-03 Equity(65 [A])         a            0          1       True
+                   Equity(66 [B])         b            1          2       True
+                   Equity(67 [C])         c            2          0      False
+
+                                    dt_value
+        2014-01-01 Equity(65 [A]) 2011-01-01
+                   Equity(66 [B])        NaT
+                   Equity(67 [C]) 2011-01-03
+        2014-01-02 Equity(65 [A]) 2011-01-01
+                   Equity(66 [B])        NaT
+                   Equity(67 [C]) 2011-01-03
+        2014-01-03 Equity(65 [A]) 2011-01-01
+                   Equity(66 [B]) 2011-01-02
+                   Equity(67 [C]) 2011-01-03
+        """
+        dates = (self.dates[0], self.dates[-1], self.dates[0], self.dates[1])
         df = pd.DataFrame({
-            'sid': self.ASSET_FINDER_EQUITY_SIDS[:-1],
-            'float_value': (0., 1.,),
-            'str_value': ("a", "b",),
-            'int_value': (1, 2),
-            'bool_value': (True, True),
+            'sid': self.ASSET_FINDER_EQUITY_SIDS[:-1] +
+            (self.ASSET_FINDER_EQUITY_SIDS[-1],)*2,
+            'float_value': (0., 1., 2., np.NaN),
+            'str_value': ("a", "b", "c", None),
+            'int_value': (1, 2, 3, 0),
+            'bool_value': (True, True, True, False),
             'dt_value': (pd.Timestamp('2011-01-01'),
-                         pd.Timestamp('2011-01-02')),
+                         pd.Timestamp('2011-01-02'),
+                         pd.Timestamp('2011-01-03'),
+                         pd.NaT),
             'asof_date': dates,
             'timestamp': dates,
         })
+
         expr = bz.data(
             df,
             dshape="""
@@ -302,27 +473,27 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
         expected = pd.DataFrame({
             "str_value": np.array(["a",
                                    None,
-                                   None,
+                                   "c",
                                    "a",
                                    None,
-                                   None,
+                                   "c",
                                    "a",
                                    "b",
-                                   None],
+                                   "c"],
                                   dtype='object'),
             "float_value": np.array([0,
                                      np.NaN,
-                                     np.NaN,
+                                     2,
                                      0,
                                      np.NaN,
-                                     np.NaN,
+                                     2,
                                      0,
                                      1,
-                                     np.NaN],
+                                     2],
                                     dtype='float64'),
             "int_value": np.array([1,
                                    0,
-                                   0,
+                                   3,
                                    1,
                                    0,
                                    0,
@@ -332,7 +503,7 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
                                   dtype='int64'),
             "bool_value": np.array([True,
                                     False,
-                                    False,
+                                    True,
                                     True,
                                     False,
                                     False,
@@ -342,13 +513,13 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
                                    dtype='bool'),
             "dt_value": [pd.Timestamp('2011-01-01'),
                          pd.NaT,
-                         pd.NaT,
+                         pd.Timestamp('2011-01-03'),
                          pd.Timestamp('2011-01-01'),
                          pd.NaT,
-                         pd.NaT,
+                         pd.Timestamp('2011-01-03'),
                          pd.Timestamp('2011-01-01'),
                          pd.Timestamp('2011-01-02'),
-                         pd.NaT],
+                         pd.Timestamp('2011-01-03')],
         },
             columns=['str_value', 'float_value', 'int_value', 'bool_value',
                      'dt_value'],
diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py
index 28af4faf..d17612a4 100644
--- a/zipline/pipeline/loaders/blaze/core.py
+++ b/zipline/pipeline/loaders/blaze/core.py
@@ -179,7 +179,6 @@ from zipline.utils.input_validation import (
 from zipline.utils.numpy_utils import (
     categorical_dtype,
     repeat_last_axis,
-    datetime64ns_dtype
 )
 from zipline.utils.pandas_utils import sort_values
 from zipline.utils.preprocess import preprocess
@@ -1035,25 +1034,38 @@ class BlazeLoader(dict):
 
         sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
         dense_output = last_in_date_group(sparse_output, reindex=True)
-        dense_output = dense_output.ffill()
+        dense_output.ffill(inplace=True)
 
-        # Unstack will fill all missing values with NaN; we need to fix
-        # this for all types that are not float.
+        # Fill in missing values specified by each column. This is made
+        # significantly more complex by the fact that we need to work around
+        # two pandas issues:
+
+        # 1) When we have sids, if there are no records for a given sid for any
+        #    dates, pandas will generate a column full of NaNs for that sid.
+        #    This means that some of the columns in `dense_output` are now
+        #    float instead of the intended dtype, so we have to coerce back to
+        #    our expected type and convert NaNs into the desired missing value.
+
+        # 2) DataFrame.fillna assumes that receiving None as a fill-value
+        #    means that no value was passed.  Consequently, there's no way to
+        #    tell pandas to replace NaNs in an object column with None using
+        #    fillna, so we have to roll our own instead using df.where.
         for column in columns:
+            # Special logic for strings since `fillna` doesn't work if the
+            # missing value is `None`.
             if column.dtype == categorical_dtype:
                 dense_output[column.name] = dense_output[
                     column.name
                 ].where(pd.notnull(dense_output[column.name]),
                         column.missing_value)
-            # Need to convert from float col to datetime col
-            elif column.dtype == datetime64ns_dtype:
-                dense_output[column.name] = dense_output[
-                    column.name
-                ].astype('datetime64[ns]')
             else:
+                # We need to execute `fillna` before `astype` in case the
+                # column contains NaNs and needs to be cast to bool or int.
+                # This is so that the NaNs are replaced first, since pandas
+                # can't convert NaNs for those types.
                 dense_output[column.name] = dense_output[
                     column.name
-                ].fillna(column.missing_value)
+                ].fillna(column.missing_value).astype(column.dtype)
 
         if have_sids:
             adjustments_from_deltas = adjustments_from_deltas_with_sids