TST: add tests to ensure no forward filling of non-missing values

STY: fix indentation

DOC: add docs to clarify test input/output
This commit is contained in:
Maya Tydykov
2016-05-18 15:48:04 -04:00
parent 3a3c7db844
commit e5039a43b0
2 changed files with 212 additions and 29 deletions
+190 -19
View File
@@ -270,19 +270,190 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
NonPipelineField,
)
def test_cols_with_missing_vals(self):
dates = (self.dates[0], self.dates[-1])
def test_cols_with_all_missing_vals(self):
"""
Tests that when there is no known data, we get output where the
columns have the right dtypes and the right missing values filled in.
The expected output is indexed by the product of ``self.dates`` and the
equities retrieved for ``ASSET_FINDER_EQUITY_SIDS``.
input (df):
Empty DataFrame
Columns: [sid, float_value, str_value, int_value, bool_value, dt_value,
asof_date, timestamp]
Index: []
output (expected):
str_value float_value int_value
2014-01-01 Equity(65 [A]) None NaN 0
Equity(66 [B]) None NaN 0
Equity(67 [C]) None NaN 0
2014-01-02 Equity(65 [A]) None NaN 0
Equity(66 [B]) None NaN 0
Equity(67 [C]) None NaN 0
2014-01-03 Equity(65 [A]) None NaN 0
Equity(66 [B]) None NaN 0
Equity(67 [C]) None NaN 0
dt_value bool_value
2014-01-01 Equity(65 [A]) NaT False
Equity(66 [B]) NaT False
Equity(67 [C]) NaT False
2014-01-02 Equity(65 [A]) NaT False
Equity(66 [B]) NaT False
Equity(67 [C]) NaT False
2014-01-03 Equity(65 [A]) NaT False
Equity(66 [B]) NaT False
Equity(67 [C]) NaT False
"""
# Frame with the right column names but zero rows, i.e. no known data.
df = pd.DataFrame(columns=['sid', 'float_value', 'str_value',
'int_value', 'bool_value', 'dt_value',
'asof_date', 'timestamp'])
# The empty frame carries no usable dtypes, so the schema is spelled out
# explicitly through the dshape.
expr = bz.data(
df,
dshape="""
var * {
sid: int64,
float_value: float64,
str_value: string,
int_value: int64,
bool_value: bool,
dt_value: datetime,
asof_date: datetime,
timestamp: datetime,
}""",
)
# Keep the dshape's field order for the record type handed to _test_id.
fields = OrderedDict(expr.dshape.measure.fields)
# Nine entries per column: presumably 3 dates x 3 assets, matching the
# table in the docstring -- the index below is their product.
expected = pd.DataFrame({
"str_value": np.array([None,
None,
None,
None,
None,
None,
None,
None,
None],
dtype='object'),
"float_value": np.array([np.NaN,
np.NaN,
np.NaN,
np.NaN,
np.NaN,
np.NaN,
np.NaN,
np.NaN,
np.NaN],
dtype='float64'),
"int_value": np.array([0,
0,
0,
0,
0,
0,
0,
0,
0],
dtype='int64'),
"bool_value": np.array([False,
False,
False,
False,
False,
False,
False,
False,
False],
dtype='bool'),
"dt_value": [pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT,
pd.NaT],
},
columns=['str_value', 'float_value', 'int_value', 'bool_value',
'dt_value'],
index=pd.MultiIndex.from_product(
(self.dates, self.asset_finder.retrieve_all(
self.ASSET_FINDER_EQUITY_SIDS
))
)
)
# Only the value columns are requested; sid, asof_date and timestamp are
# left out of the tuple of fields to check.
self._test_id(
df,
var * Record(fields),
expected,
self.asset_finder,
('float_value', 'str_value', 'int_value', 'bool_value',
'dt_value'),
)
def test_cols_with_some_missing_vals(self):
"""
Tests the following:
1) Forward filling replaces missing values correctly for the data
types supported in pipeline.
2) We don't forward fill when the missing value is the actual value
we got for a date in the case of int/bool columns.
3) We get the correct type of missing value in the output.
input (df):
asof_date bool_value dt_value float_value int_value sid
0 2014-01-01 True 2011-01-01 0 1 65
1 2014-01-03 True 2011-01-02 1 2 66
2 2014-01-01 True 2011-01-03 2 3 67
3 2014-01-02 False NaT NaN 0 67
str_value timestamp
0 a 2014-01-01
1 b 2014-01-03
2 c 2014-01-01
3 None 2014-01-02
output (expected)
str_value float_value int_value bool_value
2014-01-01 Equity(65 [A]) a 0 1 True
Equity(66 [B]) None NaN 0 False
Equity(67 [C]) c 2 3 True
2014-01-02 Equity(65 [A]) a 0 1 True
Equity(66 [B]) None NaN 0 False
Equity(67 [C]) c 2 0 False
2014-01-03 Equity(65 [A]) a 0 1 True
Equity(66 [B]) b 1 2 True
Equity(67 [C]) c 2 0 False
dt_value
2014-01-01 Equity(65 [A]) 2011-01-01
Equity(66 [B]) NaT
Equity(67 [C]) 2011-01-03
2014-01-02 Equity(65 [A]) 2011-01-01
Equity(66 [B]) NaT
Equity(67 [C]) 2011-01-03
2014-01-03 Equity(65 [A]) 2011-01-01
Equity(66 [B]) 2011-01-02
Equity(67 [C]) 2011-01-03
"""
dates = (self.dates[0], self.dates[-1], self.dates[0], self.dates[1])
df = pd.DataFrame({
'sid': self.ASSET_FINDER_EQUITY_SIDS[:-1],
'float_value': (0., 1.,),
'str_value': ("a", "b",),
'int_value': (1, 2),
'bool_value': (True, True),
'sid': self.ASSET_FINDER_EQUITY_SIDS[:-1] +
(self.ASSET_FINDER_EQUITY_SIDS[-1],)*2,
'float_value': (0., 1., 2., np.NaN),
'str_value': ("a", "b", "c", None),
'int_value': (1, 2, 3, 0),
'bool_value': (True, True, True, False),
'dt_value': (pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02')),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
pd.NaT),
'asof_date': dates,
'timestamp': dates,
})
expr = bz.data(
df,
dshape="""
@@ -302,27 +473,27 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
expected = pd.DataFrame({
"str_value": np.array(["a",
None,
None,
"c",
"a",
None,
None,
"c",
"a",
"b",
None],
"c"],
dtype='object'),
"float_value": np.array([0,
np.NaN,
np.NaN,
2,
0,
np.NaN,
np.NaN,
2,
0,
1,
np.NaN],
2],
dtype='float64'),
"int_value": np.array([1,
0,
0,
3,
1,
0,
0,
@@ -332,7 +503,7 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
dtype='int64'),
"bool_value": np.array([True,
False,
False,
True,
True,
False,
False,
@@ -342,13 +513,13 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
dtype='bool'),
"dt_value": [pd.Timestamp('2011-01-01'),
pd.NaT,
pd.NaT,
pd.Timestamp('2011-01-03'),
pd.Timestamp('2011-01-01'),
pd.NaT,
pd.NaT,
pd.Timestamp('2011-01-03'),
pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.NaT],
pd.Timestamp('2011-01-03')],
},
columns=['str_value', 'float_value', 'int_value', 'bool_value',
'dt_value'],
+22 -10
View File
@@ -179,7 +179,6 @@ from zipline.utils.input_validation import (
from zipline.utils.numpy_utils import (
categorical_dtype,
repeat_last_axis,
datetime64ns_dtype
)
from zipline.utils.pandas_utils import sort_values
from zipline.utils.preprocess import preprocess
@@ -1035,25 +1034,38 @@ class BlazeLoader(dict):
sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
dense_output = last_in_date_group(sparse_output, reindex=True)
dense_output = dense_output.ffill()
dense_output.ffill(inplace=True)
# Unstack will fill all missing values with NaN; we need to fix
# this for all types that are not float.
# Fill in missing values specified by each column. This is made
# significantly more complex by the fact that we need to work around
# two pandas issues:
# 1) When we have sids, if there are no records for a given sid for any
# dates, pandas will generate a column full of NaNs for that sid.
# This means that some of the columns in `dense_output` are now
# float instead of the intended dtype, so we have to coerce back to
# our expected type and convert NaNs into the desired missing value.
# 2) DataFrame.fillna assumes that receiving None as a fill-value means
# that no value was passed. Consequently, there's no way to tell
# pandas to replace NaNs in an object column with None using fillna,
# so we have to roll our own instead using df.where.
for column in columns:
# Special logic for strings since `fillna` doesn't work if the
# missing value is `None`.
if column.dtype == categorical_dtype:
dense_output[column.name] = dense_output[
column.name
].where(pd.notnull(dense_output[column.name]),
column.missing_value)
# Need to convert from float col to datetime col
elif column.dtype == datetime64ns_dtype:
dense_output[column.name] = dense_output[
column.name
].astype('datetime64[ns]')
else:
# We need to execute `fillna` before `astype` in case the
# column contains NaNs and needs to be cast to bool or int.
# This is so that the NaNs are replaced first, since pandas
# can't convert NaNs for those types.
dense_output[column.name] = dense_output[
column.name
].fillna(column.missing_value)
].fillna(column.missing_value).astype(column.dtype)
if have_sids:
adjustments_from_deltas = adjustments_from_deltas_with_sids