mirror of
https://github.com/wassname/catalyst.git
synced 2026-07-01 22:55:52 +08:00
BUG: sort data on asof_date to resolve ts conflicts
MAINT: fix arg default and update docstring
This commit is contained in:
@@ -776,8 +776,9 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
|
||||
check_dtype=False,
|
||||
)
|
||||
|
||||
def _test_id_macro(self, df, dshape, expected, finder, add):
|
||||
dates = self.dates
|
||||
def _test_id_macro(self, df, dshape, expected, finder, add, dates=None):
|
||||
if dates is None:
|
||||
dates = self.dates
|
||||
expr = bz.data(df, name='expr', dshape=dshape)
|
||||
loader = BlazeLoader()
|
||||
ds = from_blaze(
|
||||
@@ -1875,6 +1876,56 @@ class BlazeToPipelineTestCase(WithAssetFinder, ZiplineTestCase):
|
||||
|
||||
self._test_checkpoints(checkpoints)
|
||||
|
||||
def test_id_take_last_in_group_sorted(self):
    """
    Check that rows sharing a timestamp are resolved by latest asof_date.

    Both input rows become known at the same timestamp, but they are
    supplied with their asof_dates in descending order. The loader must
    therefore order by asof_date before taking the last row of each group.

    input
     asof_date      timestamp  other  value
    2014-01-03  2014-01-04 00      3      3
    2014-01-02  2014-01-04 00      2      2

    output (expected):

                other  value
    2014-01-02    NaN    NaN
    2014-01-03    NaN    NaN
    2014-01-06      3      3
    """
    stamp = pd.Timestamp

    dates = pd.DatetimeIndex([
        stamp('2014-01-02'),
        stamp('2014-01-03'),
        stamp('2014-01-06'),
    ])

    # The asof_dates are deliberately listed newest-first: without a sort
    # on asof_date ahead of the last-in-group selection, the stale
    # 2014-01-02 row would be picked instead of the 2014-01-03 one.
    rows = [
        [stamp('2014-01-03'), stamp('2014-01-04 00'), 3, 3],
        [stamp('2014-01-02'), stamp('2014-01-04 00'), 2, 2],
    ]
    df = pd.DataFrame(
        data=rows,
        columns=['asof_date', 'timestamp', 'other', 'value'],
    )

    # Extend the macro measure with an 'other' column typed like 'value'.
    measure_fields = OrderedDict(self.macro_dshape.measure.fields)
    measure_fields['other'] = measure_fields['value']

    expected = pd.DataFrame(
        index=dates,
        columns=['other', 'value'],
        data=[
            [np.nan, np.nan],  # 2014-01-02
            [np.nan, np.nan],  # 2014-01-03
            [3, 3],            # 2014-01-06
        ],
    )

    self._test_id_macro(
        df,
        var * Record(measure_fields),
        expected,
        self.asset_finder,
        ('other', 'value'),
        dates=dates,
    )
|
||||
|
||||
|
||||
class MiscTestCase(ZiplineTestCase):
|
||||
def test_exprdata_repr(self):
|
||||
|
||||
@@ -1104,6 +1104,11 @@ class BlazeLoader(dict):
|
||||
materialized_deltas,
|
||||
dates,
|
||||
)
|
||||
# If we ever have cases where we find out about multiple asof_dates'
|
||||
# data on the same TS, we want to make sure that last_in_date_group
|
||||
# selects the correct last asof_date's value.
|
||||
sparse_output.sort_values(AD_FIELD_NAME, inplace=True)
|
||||
non_novel_deltas.sort_values(AD_FIELD_NAME, inplace=True)
|
||||
if AD_FIELD_NAME not in requested_columns:
|
||||
sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)
|
||||
|
||||
|
||||
@@ -281,15 +281,17 @@ def last_in_date_group(df,
|
||||
assets,
|
||||
reindex=True,
|
||||
have_sids=True,
|
||||
extra_groupers=[]):
|
||||
extra_groupers=None):
|
||||
"""
|
||||
Determine the last piece of information known on each date in the date
|
||||
index for each group.
|
||||
index for each group. Input df MUST be sorted such that the correct last
|
||||
item is chosen from each group.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : pd.DataFrame
|
||||
The DataFrame containing the data to be grouped.
|
||||
The DataFrame containing the data to be grouped. Must be sorted so that
|
||||
the correct last item is chosen from each group.
|
||||
dates : pd.DatetimeIndex
|
||||
The dates to use for grouping and reindexing.
|
||||
assets : pd.Int64Index
|
||||
@@ -316,6 +318,8 @@ def last_in_date_group(df,
|
||||
)]]
|
||||
if have_sids:
|
||||
idx += [SID_FIELD_NAME]
|
||||
if extra_groupers is None:
|
||||
extra_groupers = []
|
||||
idx += extra_groupers
|
||||
|
||||
last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
|
||||
|
||||
Reference in New Issue
Block a user