From e5039a43b09bbcdf6bfd20e6f54abd623312ffd1 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Wed, 18 May 2016 15:48:04 -0400 Subject: [PATCH] TST: add tests to ensure no forward filling of non-missing values STY: fix indentation DOC: add docs to clarify test input/output --- tests/pipeline/test_blaze.py | 209 ++++++++++++++++++++++--- zipline/pipeline/loaders/blaze/core.py | 32 ++-- 2 files changed, 212 insertions(+), 29 deletions(-) diff --git a/tests/pipeline/test_blaze.py b/tests/pipeline/test_blaze.py index 512be3ee..742a6b70 100644 --- a/tests/pipeline/test_blaze.py +++ b/tests/pipeline/test_blaze.py @@ -270,19 +270,190 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase): NonPipelineField, ) - def test_cols_with_missing_vals(self): - dates = (self.dates[0], self.dates[-1]) + def test_cols_with_all_missing_vals(self): + """ + Tests that when there is no known data, we get output where the + columns have the right dtypes and the right missing values filled in. + + input (self.df): + Empty DataFrame + Columns: [sid, float_value, str_value, int_value, bool_value, dt_value, + asof_date, timestamp] + Index: [] + + output (expected) + str_value float_value int_value + 2014-01-01 Equity(65 [A]) None NaN 0 + Equity(66 [B]) None NaN 0 + Equity(67 [C]) None NaN 0 + 2014-01-02 Equity(65 [A]) None NaN 0 + Equity(66 [B]) None NaN 0 + Equity(67 [C]) None NaN 0 + 2014-01-03 Equity(65 [A]) None NaN 0 + Equity(66 [B]) None NaN 0 + Equity(67 [C]) None NaN 0 + + dt_value bool_value + 2014-01-01 Equity(65 [A]) NaT False + Equity(66 [B]) NaT False + Equity(67 [C]) NaT False + 2014-01-02 Equity(65 [A]) NaT False + Equity(66 [B]) NaT False + Equity(67 [C]) NaT False + 2014-01-03 Equity(65 [A]) NaT False + Equity(66 [B]) NaT False + Equity(67 [C]) NaT False + """ + df = pd.DataFrame(columns=['sid', 'float_value', 'str_value', + 'int_value', 'bool_value', 'dt_value', + 'asof_date', 'timestamp']) + + expr = bz.data( + df, + dshape=""" + var * { + sid: int64, + float_value: float64, + str_value: string, + int_value: int64, + bool_value: bool, + dt_value: datetime, + asof_date: datetime, + timestamp: datetime, + }""", + ) + fields = OrderedDict(expr.dshape.measure.fields) + + expected = pd.DataFrame({ + "str_value": np.array([None, + None, + None, + None, + None, + None, + None, + None, + None], + dtype='object'), + "float_value": np.array([np.NaN, + np.NaN, + np.NaN, + np.NaN, + np.NaN, + np.NaN, + np.NaN, + np.NaN, + np.NaN], + dtype='float64'), + "int_value": np.array([0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0], + dtype='int64'), + "bool_value": np.array([False, + False, + False, + False, + False, + False, + False, + False, + False], + dtype='bool'), + "dt_value": [pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT], + }, + columns=['str_value', 'float_value', 'int_value', 'bool_value', + 'dt_value'], + index=pd.MultiIndex.from_product( + (self.dates, self.asset_finder.retrieve_all( + self.ASSET_FINDER_EQUITY_SIDS + )) + ) + ) + + self._test_id( + df, + var * Record(fields), + expected, + self.asset_finder, + ('float_value', 'str_value', 'int_value', 'bool_value', + 'dt_value'), + ) + + def test_cols_with_some_missing_vals(self): + """ + Tests the following: + 1) Forward filling replaces missing values correctly for the data + types supported in pipeline. + 2) We don't forward fill when the missing value is the actual value + we got for a date in the case of int/bool columns. + 3) We get the correct type of missing value in the output. + + input (self.df): + asof_date bool_value dt_value float_value int_value sid + 0 2014-01-01 True 2011-01-01 0 1 65 + 1 2014-01-03 True 2011-01-02 1 2 66 + 2 2014-01-01 True 2011-01-03 2 3 67 + 3 2014-01-02 False NaT NaN 0 67 + + str_value timestamp + 0 a 2014-01-01 + 1 b 2014-01-03 + 2 c 2014-01-01 + 3 None 2014-01-02 + + output (expected) + str_value float_value int_value bool_value + 2014-01-01 Equity(65 [A]) a 0 1 True + Equity(66 [B]) None NaN 0 False + Equity(67 [C]) c 2 3 True + 2014-01-02 Equity(65 [A]) a 0 1 True + Equity(66 [B]) None NaN 0 False + Equity(67 [C]) c 2 0 False + 2014-01-03 Equity(65 [A]) a 0 1 True + Equity(66 [B]) b 1 2 True + Equity(67 [C]) c 2 0 False + + dt_value + 2014-01-01 Equity(65 [A]) 2011-01-01 + Equity(66 [B]) NaT + Equity(67 [C]) 2011-01-03 + 2014-01-02 Equity(65 [A]) 2011-01-01 + Equity(66 [B]) NaT + Equity(67 [C]) 2011-01-03 + 2014-01-03 Equity(65 [A]) 2011-01-01 + Equity(66 [B]) 2011-01-02 + Equity(67 [C]) 2011-01-03 + """ + dates = (self.dates[0], self.dates[-1], self.dates[0], self.dates[1]) df = pd.DataFrame({ - 'sid': self.ASSET_FINDER_EQUITY_SIDS[:-1], - 'float_value': (0., 1.,), - 'str_value': ("a", "b",), - 'int_value': (1, 2), - 'bool_value': (True, True), + 'sid': self.ASSET_FINDER_EQUITY_SIDS[:-1] + + (self.ASSET_FINDER_EQUITY_SIDS[-1],)*2, + 'float_value': (0., 1., 2., np.NaN), + 'str_value': ("a", "b", "c", None), + 'int_value': (1, 2, 3, 0), + 'bool_value': (True, True, True, False), 'dt_value': (pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02')), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.NaT), 'asof_date': dates, 'timestamp': dates, }) + expr = bz.data( df, dshape=""" @@ -302,27 +473,27 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase): expected = pd.DataFrame({ "str_value": np.array(["a", None, - None, + "c", "a", None, - None, + "c", "a", "b", - None], + "c"], dtype='object'), "float_value": np.array([0, np.NaN, - np.NaN, + 2, 0, np.NaN, - np.NaN, + 2, 0, 1, - np.NaN], + 2], dtype='float64'), "int_value": np.array([1, 0, - 0, + 3, 1, 0, 0, @@ -332,7 +503,7 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase): dtype='int64'), "bool_value": np.array([True, False, - False, + True, True, False, False, @@ -342,13 +513,13 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase): dtype='bool'), "dt_value": [pd.Timestamp('2011-01-01'), pd.NaT, - pd.NaT, + pd.Timestamp('2011-01-03'), pd.Timestamp('2011-01-01'), pd.NaT, - pd.NaT, + pd.Timestamp('2011-01-03'), pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), - pd.NaT], + pd.Timestamp('2011-01-03')], }, columns=['str_value', 'float_value', 'int_value', 'bool_value', 'dt_value'], diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 28af4faf..d17612a4 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -179,7 +179,6 @@ from zipline.utils.input_validation import ( from zipline.utils.numpy_utils import ( categorical_dtype, repeat_last_axis, - datetime64ns_dtype ) from zipline.utils.pandas_utils import sort_values from zipline.utils.preprocess import preprocess @@ -1035,25 +1034,38 @@ class BlazeLoader(dict): sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False) dense_output = last_in_date_group(sparse_output, reindex=True) - dense_output = dense_output.ffill() + dense_output.ffill(inplace=True) - # Unstack will fill all missing values with NaN; we need to fix - # this for all types that are not float. + # Fill in missing values specified by each column. This is made + # significantly more complex by the fact that we need to work around + # two pandas issues: + + # 1) When we have sids, if there are no records for a given sid for any + # dates, pandas will generate a column full of NaNs for that sid. + # This means that some of the columns in `dense_output` are now + # float instead of the intended dtype, so we have to coerce back to + # our expected type and convert NaNs into the desired missing value. + + # 2) DataFrame.ffill assumes that receiving None as a fill-value means + # that no value was passed. Consequently, there's no way to tell + # pandas to replace NaNs in an object column with None using fillna, + # so we have to roll our own instead using df.where. for column in columns: + # Special logic for strings since `fillna` doesn't work if the + # missing value is `None`. if column.dtype == categorical_dtype: dense_output[column.name] = dense_output[ column.name ].where(pd.notnull(dense_output[column.name]), column.missing_value) - # Need to convert from float col to datetime col - elif column.dtype == datetime64ns_dtype: - dense_output[column.name] = dense_output[ - column.name - ].astype('datetime64[ns]') else: + # We need to execute `fillna` before `astype` in case the + # column contains NaNs and needs to be cast to bool or int. + # This is so that the NaNs are replaced first, since pandas + # can't convert NaNs for those types. dense_output[column.name] = dense_output[ column.name - ].fillna(column.missing_value) + ].fillna(column.missing_value).astype(column.dtype) if have_sids: adjustments_from_deltas = adjustments_from_deltas_with_sids