From 6c6a33c73b7545f96e6217e9fdccf1fbad90cb22 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Fri, 5 Aug 2016 11:53:29 -0400 Subject: [PATCH 01/16] ENH: add loader for estimates --- tests/pipeline/test_quarters_estimates.py | 14 ++ zipline/pipeline/common.py | 2 + zipline/pipeline/loaders/blaze/estimates.py | 146 ++++++++++++ zipline/pipeline/loaders/blaze/events.py | 32 +-- zipline/pipeline/loaders/events.py | 12 +- zipline/pipeline/loaders/quarter_estimates.py | 221 ++++++++++++++++++ zipline/pipeline/loaders/utils.py | 32 +++ 7 files changed, 420 insertions(+), 39 deletions(-) create mode 100644 tests/pipeline/test_quarters_estimates.py create mode 100644 zipline/pipeline/loaders/blaze/estimates.py create mode 100644 zipline/pipeline/loaders/quarter_estimates.py diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py new file mode 100644 index 00000000..201011ea --- /dev/null +++ b/tests/pipeline/test_quarters_estimates.py @@ -0,0 +1,14 @@ +def test_shift_quarters_forward(): + quarters = list(range(1, 5)) + shifts = list(range(5)) + expected = [(x, i) for ] + expected = ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), + (0, 2), (0, 3), (0, 4), (1, 1), (1, 2)) + for quarter in quarters: + for shift in shifts: + yrs_to_shift, new_qtr = EstimizeLoader.calc_forward_shift(quarter, + shift) + if quarter + shift <= 4: + assert yrs_to_shift == 0 + assert new_qtr == quarter + shift + else: diff --git a/zipline/pipeline/common.py b/zipline/pipeline/common.py index e64b1dc9..7b48e271 100644 --- a/zipline/pipeline/common.py +++ b/zipline/pipeline/common.py @@ -6,6 +6,8 @@ ANNOUNCEMENT_FIELD_NAME = 'announcement_date' CASH_FIELD_NAME = 'cash' DAYS_SINCE_PREV = 'days_since_prev' DAYS_TO_NEXT = 'days_to_next' +FISCAL_QUARTER_FIELD_NAME = 'fiscal_quarter' +FISCAL_YEAR_FIELD_NAME = 'fiscal_year' NEXT_ANNOUNCEMENT = 'next_announcement' PREVIOUS_AMOUNT = 'previous_amount' PREVIOUS_ANNOUNCEMENT = 'previous_announcement' diff --git 
a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py new file mode 100644 index 00000000..6a89fa7b --- /dev/null +++ b/zipline/pipeline/loaders/blaze/estimates.py @@ -0,0 +1,146 @@ +from datashape import istabular + +from .core import ( + bind_expression_to_resources, + ffill_query_in_range, +) +from zipline.pipeline.loaders.base import PipelineLoader +from zipline.pipeline.loaders.events import ( + EventsLoader, + required_event_fields, +) +from zipline.pipeline.common import ( + SID_FIELD_NAME, + TS_FIELD_NAME, +) +from zipline.pipeline.loaders.quarter_estimates import \ + NextQuartersEstimatesLoader, PreviousQuartersEstimatesLoader +from zipline.pipeline.loaders.utils import ( + check_data_query_args, + normalize_data_query_bounds, + normalize_timestamp_to_query_time, + load_raw_data) +from zipline.utils.input_validation import ensure_timezone, optionally +from zipline.utils.preprocess import preprocess + + +class BlazeEstimatesLoader(PipelineLoader): + """An abstract pipeline loader for the estimates datasets that loads + data from a blaze expression. + + Parameters + ---------- + expr : Expr + The expression representing the data to load. + resources : dict, optional + Mapping from the loadable terms of ``expr`` to actual data resources. + odo_kwargs : dict, optional + Extra keyword arguments to pass to odo when executing the expression. + data_query_time : time, optional + The time to use for the data query cutoff. + data_query_tz : tzinfo or str + The timezeone to use for the data query cutoff. + dataset : DataSet + The DataSet object for which this loader loads data. 
+ + Notes + ----- + The expression should have a tabular dshape of:: + + Dim * {{ + {SID_FIELD_NAME}: int64, + {TS_FIELD_NAME}: datetime, + }} + + And other dataset-specific fields, where each row of the table is a + record including the sid to identify the company, the timestamp where we + learned about the announcement, and the date when the earnings will be z + announced. + + If the '{TS_FIELD_NAME}' field is not included it is assumed that we + start the backtest with knowledge of all announcements. + """ + + @preprocess(data_query_tz=optionally(ensure_timezone)) + def __init__(self, + expr, + columns, + resources=None, + odo_kwargs=None, + data_query_time=None, + data_query_tz=None, + loader=None): + + dshape = expr.dshape + if not istabular(dshape): + raise ValueError( + 'expression dshape must be tabular, got: %s' % dshape, + ) + + required_cols = list( + required_event_fields(columns) + ) + self._expr = bind_expression_to_resources( + expr[required_cols], + resources, + ) + self._columns = columns + self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {} + check_data_query_args(data_query_time, data_query_tz) + self._data_query_time = data_query_time + self._data_query_tz = data_query_tz + self.loader = loader + + def load_adjusted_array(self, columns, dates, assets, mask): + raw = load_raw_data(assets, dates, self._data_query_time, + self._data_query_tz, self._exp, self._odo_kwargs) + + return self.loader( + events=raw, + next_value_columns=self._columns, + ).load_adjusted_array( + columns, + dates, + assets, + mask, + ) + + +class BlazeNextEstimatesLoader(BlazeEstimatesLoader): + loader = NextQuartersEstimatesLoader + + def __init__(self, + expr, + columns, + resources=None, + odo_kwargs=None, + data_query_time=None, + data_query_tz=None, + loader=None): + super(BlazeNextEstimatesLoader).__init__(expr, + columns, + resources, + odo_kwargs, + data_query_time, + data_query_tz, + loader) + + +class BlazePreviousEstimatesLoader(BlazeEstimatesLoader): 
+ loader = PreviousQuartersEstimatesLoader + + def __init__(self, + expr, + columns, + resources=None, + odo_kwargs=None, + data_query_time=None, + data_query_tz=None, + loader=None): + super(BlazeNextEstimatesLoader).__init__(expr, + columns, + resources, + odo_kwargs, + data_query_time, + data_query_tz, + loader) diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index c71646cb..4165166b 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -17,7 +17,7 @@ from zipline.pipeline.loaders.utils import ( check_data_query_args, normalize_data_query_bounds, normalize_timestamp_to_query_time, -) + load_raw_data) from zipline.utils.input_validation import ensure_timezone, optionally from zipline.utils.preprocess import preprocess @@ -90,34 +90,8 @@ class BlazeEventsLoader(PipelineLoader): self._data_query_tz = data_query_tz def load_adjusted_array(self, columns, dates, assets, mask): - data_query_time = self._data_query_time - data_query_tz = self._data_query_tz - lower_dt, upper_dt = normalize_data_query_bounds( - dates[0], - dates[-1], - data_query_time, - data_query_tz, - ) - - raw = ffill_query_in_range( - self._expr, - lower_dt, - upper_dt, - self._odo_kwargs, - ) - sids = raw.loc[:, SID_FIELD_NAME] - raw.drop( - sids[~sids.isin(assets)].index, - inplace=True - ) - if data_query_time is not None: - normalize_timestamp_to_query_time( - raw, - data_query_time, - data_query_tz, - inplace=True, - ts_field=TS_FIELD_NAME, - ) + raw = load_raw_data(assets, dates, self._data_query_time, + self._data_query_tz, self._expr, self._odo_kwargs) return EventsLoader( events=raw, diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 645143d8..1c49779c 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -41,16 +41,8 @@ def validate_column_specs(events, next_value_columns, previous_value_columns): serve the 
BoundColumns described by ``next_value_columns`` and ``previous_value_columns``. """ - required = { - TS_FIELD_NAME, - SID_FIELD_NAME, - EVENT_DATE_FIELD_NAME, - }.union( - # We also expect any of the field names that our loadable columns - # are mapped to. - viewvalues(next_value_columns), - viewvalues(previous_value_columns), - ) + required = required_event_fields(next_value_columns, + previous_value_columns) received = set(events.columns) missing = required - received if missing: diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py new file mode 100644 index 00000000..495bc075 --- /dev/null +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -0,0 +1,221 @@ +from itertools import groupby +import numpy as np +import pandas as pd +from six import viewvalues +from zipline.pipeline.common import AD_FIELD_NAME, SID_FIELD_NAME, \ + EVENT_DATE_FIELD_NAME, FISCAL_QUARTER_FIELD_NAME, FISCAL_YEAR_FIELD_NAME +from zipline.pipeline.loaders.base import PipelineLoader +from zipline.pipeline.loaders.frame import DataFrameLoader + + +def required_event_fields(columns): + """ + Compute the set of resource columns required to serve + ``next_value_columns`` and ``previous_value_columns``. + """ + # These metadata columns are used to align event indexers. + return { + AD_FIELD_NAME, + SID_FIELD_NAME, + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME + }.union( + # We also expect any of the field names that our loadable columns + # are mapped to. + viewvalues(columns), + ) + + +def validate_column_specs(events, columns): + """ + Verify that the columns of ``events`` can be used by an EventsLoader to + serve the BoundColumns described by ``next_value_columns`` and + ``previous_value_columns``. 
+ """ + required = required_event_fields(columns) + received = set(events.columns) + missing = required - received + if missing: + raise ValueError( + "EventsLoader missing required columns {missing}.\n" + "Got Columns: {received}\n" + "Expected Columns: {required}".format( + missing=sorted(missing), + received=sorted(received), + required=sorted(required), + ) + ) + + +def calc_forward_shift(qtr, num_shifts): + yrs_to_shift, new_qtr = divmod(qtr + num_shifts, 4) + if yrs_to_shift == 1 and new_qtr == 0: + yrs_to_shift = 0 + new_qtr = 4 + return yrs_to_shift, new_qtr + + +def calc_backward_shift(qtr, num_shifts): + yrs_to_shift, new_qtr = divmod(abs(num_shifts - qtr), 4) + if yrs_to_shift == 0 and new_qtr == 0: + yrs_to_shift = 1 + new_qtr = 4 + yrs_to_shift = -yrs_to_shift + return yrs_to_shift, new_qtr + + +class QuarterEstimatesLoader(PipelineLoader): + def __init__(self, + events, + columns): + validate_column_specs( + events, + columns + ) + + self.events = events[ + events[EVENT_DATE_FIELD_NAME].notnull() and + events[FISCAL_QUARTER_FIELD_NAME].notnull() and + events[FISCAL_YEAR_FIELD_NAME].notnull() + ] + + self.columns = columns + + def load_quarters(self, next_releases, num_quarters, dates_sids, gb): + pass + + def load_adjusted_array(self, columns, dates, assets, mask): + groups = groupby(lambda x: x.dataset.num_quarters, columns) + out = {} + date_values = pd.DataFrame(dates, columns=['dates']) + date_values['key'] = 1 + self.events['key'] = 1 + merged = pd.merge(date_values, self.events, on='key') + asset_df = pd.DataFrame(assets, columns=['sid']) + asset_df['key'] = 1 + dates_sids = pd.merge(date_values, asset_df, on='key') + for num_quarters in groups: + columns = groups[num_quarters] + # First, group by sid, fiscal year, and fiscal quarter and only + # keep the last estimate made. 
+ final_releases_per_qtr = merged[merged.asof_date <= + merged.dates].sort( + ['dates', 'asof_date'] + ).groupby( + ['dates', 'sid', 'fiscal_year', 'fiscal_quarter'] + ).last() + gb = final_releases_per_qtr.reset_index().groupby(['dates', 'sid']) + # Split the date-sid combinations into ones with a next release + # and ones without + eligible_next_releases = pd.concat([group[1] for group in gb if ( + group[1][EVENT_DATE_FIELD_NAME] >= group[1]['dates'] + ).any()]) + + eligible_next_releases.sort(EVENT_DATE_FIELD_NAME) + # For each sid, get the next release/year/quarter that we care + # about. + next_releases = eligible_next_releases.groupby( + ['dates', 'sid'] + ).min() + next_releases = next_releases.rename( + columns={'fiscal_year': 'next_fiscal_year', + 'fiscal_quarter': 'next_fiscal_quarter'} + ) + + result = self.load_quarters(next_releases, + num_quarters, + dates_sids) + + for c in columns: + column_name = self.columns[c.name] + # Need to pass a DataFrame that has dates as the index and + # all sids as columns with column values being the value in + # 'result' for column c + loader = DataFrameLoader( + c, + result.pivot(index='dates', + columns='sid', + values=column_name), + adjustments=None + ) + out[c] = loader.load_adjusted_array([c], dates, assets, mask)[c] + return out + + +class NextQuartersEstimatesLoader(QuarterEstimatesLoader): + def __init__(self, + events, + columns): + super(NextQuartersEstimatesLoader).__init__(events, columns) + + def load_quarters(self, next_releases, num_quarters, dates_sids, gb): + # `next_qtr` is already the next quarter over, + # so we should offest `num_shifts` by 1. 
+ next_releases['fiscal_quarter'] = next_releases.apply( + lambda x: calc_forward_shift(x['next_fiscal_quarter'], + num_quarters - 1)[1], + axis=1 + ) + next_releases['fiscal_year'] = next_releases.apply( + lambda x: + x['next_fiscal_year'] + + calc_forward_shift(x['next_fiscal_quarter'], + num_quarters - 1)[0], + axis=1 + ) + # Merge to get the rows we care about for each date + result = dates_sids.merge(next_releases.reset_index(), + on=(['dates', 'sid']), + how='left') + return result + + +class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): + def __init__(self, + events, + columns): + super(PreviousQuartersEstimatesLoader).__init__(events, columns) + + def load_quarters(self, next_releases, num_quarters, dates_sids, gb): + next_releases['fiscal_quarter'] = next_releases.apply( + lambda x: calc_backward_shift(x['next_fiscal_quarter'], + num_quarters)[1], + axis=1 + ) + next_releases['fiscal_year'] = next_releases.apply( + lambda x: + x['next_fiscal_year'] + + calc_backward_shift(x['next_fiscal_quarter'], + num_quarters)[0], + axis=1 + ) + only_previous_releases = pd.concat([group[1] for group in gb if ( + group[1][EVENT_DATE_FIELD_NAME] < group[1]['dates'] + ).all()]) + only_previous_releases.sort(EVENT_DATE_FIELD_NAME) + # For each sid, get the latest release we knew about prior to + # each simulation date. 
+ previous_releases = only_previous_releases.groupby(['dates', + 'sid']).max() + previous_releases = previous_releases.rename(columns={ + 'fiscal_year': 'previous_fiscal_year', + 'fiscal_quarter': 'previous_fiscal_quarter' + }) + previous_releases['fiscal_quarter'] = previous_releases.apply( + lambda x: calc_backward_shift(x['previous_fiscal_quarter'], + num_quarters)[1], + axis=1 + ) + previous_releases['fiscal_year'] = previous_releases.apply( + lambda x: + x['previous_fiscal_year'] + + calc_backward_shift(x['previous_fiscal_quarter'], + num_quarters)[0], + axis=1 + ) + all_releases = pd.concat([next_releases, previous_releases]) + # Merge to get the rows we care about for each date + result = dates_sids.merge(all_releases.reset_index(), + on=(['dates', 'sid']), how='left') + return result diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 77a9f447..2f388810 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -2,6 +2,8 @@ import datetime import numpy as np import pandas as pd +from zipline.pipeline.common import TS_FIELD_NAME, SID_FIELD_NAME +from zipline.pipeline.loaders.blaze.core import ffill_query_in_range from zipline.utils.pandas_utils import mask_between_time @@ -272,3 +274,33 @@ def check_data_query_args(data_query_time, data_query_tz): data_query_tz, ), ) + + +def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, + odo_kwargs): + lower_dt, upper_dt = normalize_data_query_bounds( + dates[0], + dates[-1], + data_query_time, + data_query_tz, + ) + raw = ffill_query_in_range( + expr, + lower_dt, + upper_dt, + odo_kwargs, + ) + sids = raw.loc[:, SID_FIELD_NAME] + raw.drop( + sids[~sids.isin(assets)].index, + inplace=True + ) + if data_query_time is not None: + normalize_timestamp_to_query_time( + raw, + data_query_time, + data_query_tz, + inplace=True, + ts_field=TS_FIELD_NAME, + ) + return raw From 6454fca6dca2e0248d98df4f62466d540d926827 Mon Sep 17 00:00:00 2001 
From: Maya Tydykov Date: Tue, 9 Aug 2016 09:52:09 -0400 Subject: [PATCH 02/16] TST: add tests for quarter estimates MAINT: modify algorithm for calculating previous releases BUG: fix quarter calculation logic --- tests/pipeline/test_quarters_estimates.py | 190 ++++++++++++++++-- zipline/pipeline/loaders/blaze/core.py | 32 ++- zipline/pipeline/loaders/blaze/estimates.py | 11 +- zipline/pipeline/loaders/blaze/events.py | 12 +- zipline/pipeline/loaders/quarter_estimates.py | 135 ++++++------- zipline/pipeline/loaders/utils.py | 86 +++++--- 6 files changed, 326 insertions(+), 140 deletions(-) diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 201011ea..02666964 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -1,14 +1,178 @@ -def test_shift_quarters_forward(): - quarters = list(range(1, 5)) - shifts = list(range(5)) - expected = [(x, i) for ] - expected = ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), - (0, 2), (0, 3), (0, 4), (1, 1), (1, 2)) - for quarter in quarters: - for shift in shifts: - yrs_to_shift, new_qtr = EstimizeLoader.calc_forward_shift(quarter, - shift) - if quarter + shift <= 4: - assert yrs_to_shift == 0 - assert new_qtr == quarter + shift +from itertools import product +import numpy as np +import pandas as pd +from zipline.pipeline import SimplePipelineEngine, Pipeline + +from zipline.pipeline.data import DataSet, Column +from zipline.pipeline.loaders.quarter_estimates import \ + NextQuartersEstimatesLoader, PreviousQuartersEstimatesLoader +from zipline.testing import ZiplineTestCase +from zipline.testing.fixtures import WithAssetFinder, WithTradingSessions +from zipline.testing.predicates import assert_equal +from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype + + +class Estimates(DataSet): + event_date = Column(dtype=datetime64ns_dtype) + fiscal_quarter = Column(dtype=float64_dtype) + fiscal_year = Column(dtype=float64_dtype) 
+ estimate = Column(dtype=float64_dtype) + value = Column(dtype=float64_dtype) + + +def QuartersEstimates(num_qtr): + class QtrEstimates(Estimates): + num_quarters = num_qtr + name=Estimates + return QtrEstimates + +# Final release dates never change +releases = pd.DataFrame({ + 'sid': [1, 1], + 'timestamp': [pd.Timestamp('2015-01-20'), pd.Timestamp('2015-4-20')], + 'event_date': [pd.Timestamp('2015-01-20'), pd.Timestamp('2015-04-20')], + 'estimate': [0.5, 0.8], + 'value': [0.6, 0.9], + 'fiscal_quarter': [1, 2], + 'fiscal_year': [2015, 2015] +}) + +estimates = pd.DataFrame({ + 'sid': [1, 1, 1, 1], + 'timestamp': [pd.Timestamp('2015-01-02'), + pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-04-02'), + pd.Timestamp('2015-4-10')], + 'event_date': [pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-04-20'), + pd.Timestamp('2015-04-20')], + 'estimate': [.1, .2, .3, .4], + 'value': [np.NaN, np.NaN, np.NaN, np.NaN], + 'fiscal_quarter': [1, 1, 2, 2], + 'fiscal_year': [2015, 2015, 2015, 2015] +}) + +events = pd.concat([releases, estimates]) + + +class NextEstimateTestCase(WithAssetFinder, + WithTradingSessions, + ZiplineTestCase): + START_DATE = pd.Timestamp('2015-01-01') + END_DATE = pd.Timestamp('2015-04-30') + + @classmethod + def make_loader(cls, events, columns): + return NextQuartersEstimatesLoader(events, columns) + + @classmethod + def init_class_fixtures(cls): + cls.events = events + cls.columns = { + Estimates.estimate: 'estimate', + Estimates.event_date: 'event_date', + Estimates.fiscal_quarter: 'fiscal_quarter', + Estimates.fiscal_year: 'fiscal_year', + Estimates.value: 'value', + } + cls.loader = cls.make_loader( + events=cls.events, + columns=cls.columns + ) + cls.ASSET_FINDER_EQUITY_SIDS = list(cls.events['sid'].unique()) + cls.ASSET_FINDER_EQUITY_SYMBOLS = [ + 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS + ] + super(NextEstimateTestCase, cls).init_class_fixtures() + + def test_regular(self): + dataset = QuartersEstimates(1) 
+ engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + + results = engine.run_pipeline( + Pipeline({c.name: c.latest for c in dataset.columns}), + start_date=self.trading_days[0], + end_date=self.trading_days[-1], + ) + sid_events = results.xs(1, level=1) + ed_sorted_events = self.events.sort(['event_date', 'timestamp']) + for i, date in enumerate(sid_events.index): + # Get all upcoming events that we know about on 'date' + eligible_timestamps = ed_sorted_events[ed_sorted_events['timestamp'] + <= date] + eligible_events = eligible_timestamps[eligible_timestamps['event_date'] >= date] + if not eligible_events.empty: + smallest_event_date = eligible_events.iloc[0]['event_date'] + expected_event = eligible_events[eligible_events['event_date'] == smallest_event_date].iloc[-1] + for colname in sid_events.columns: + expected_value = expected_event[colname] + computed_value = sid_events.iloc[i][colname] + assert_equal(expected_value, computed_value) else: + assert sid_events.iloc[i].isnull().all() + + +class PreviousEstimateTestCase(WithAssetFinder, + WithTradingSessions, + ZiplineTestCase): + START_DATE = pd.Timestamp('2015-01-01') + END_DATE = pd.Timestamp('2015-04-30') + + @classmethod + def make_loader(cls, events, columns): + return PreviousQuartersEstimatesLoader(events, columns) + + @classmethod + def init_class_fixtures(cls): + cls.events = events + cls.columns = { + Estimates.estimate: 'estimate', + Estimates.event_date: 'event_date', + Estimates.fiscal_quarter: 'fiscal_quarter', + Estimates.fiscal_year: 'fiscal_year', + Estimates.value: 'value', + } + cls.loader = cls.make_loader( + events=cls.events, + columns=cls.columns + ) + cls.ASSET_FINDER_EQUITY_SIDS = list(cls.events['sid'].unique()) + cls.ASSET_FINDER_EQUITY_SYMBOLS = [ + 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS + ] + super(PreviousEstimateTestCase, cls).init_class_fixtures() + + def test_regular(self): + dataset = QuartersEstimates(1) + 
engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + + results = engine.run_pipeline( + Pipeline({c.name: c.latest for c in dataset.columns}), + start_date=self.trading_days[0], + end_date=self.trading_days[-1], + ) + sid_events = results.xs(1, level=1) + ed_sorted_events = self.events.sort(['event_date', 'timestamp']) + for i, date in enumerate(sid_events.index): + # Filter for events that happened on or before the simulation + # date and that we knew about on or before the simulation date. + ed_eligible_events = ed_sorted_events[ed_sorted_events['event_date'] <= date] + ts_eligible_events = ed_eligible_events[ed_eligible_events['timestamp'] <= date] + if not ts_eligible_events.empty: + # The expected event is the one we knew about last. + expected_event = ts_eligible_events.iloc[-1] + for colname in sid_events.columns: + expected_value = expected_event[colname] + computed_value = sid_events.iloc[i][colname] + assert_equal(expected_value, computed_value) + else: + assert sid_events.iloc[i].isnull().all() diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 01e1e659..fc3252e4 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -1219,6 +1219,36 @@ def bind_expression_to_resources(expr, resources): }) +def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, + odo_kwargs): + lower_dt, upper_dt = normalize_data_query_bounds( + dates[0], + dates[-1], + data_query_time, + data_query_tz, + ) + raw = ffill_query_in_range( + expr, + lower_dt, + upper_dt, + odo_kwargs, + ) + sids = raw.loc[:, SID_FIELD_NAME] + raw.drop( + sids[~sids.isin(assets)].index, + inplace=True + ) + if data_query_time is not None: + normalize_timestamp_to_query_time( + raw, + data_query_time, + data_query_tz, + inplace=True, + ts_field=TS_FIELD_NAME, + ) + return raw + + def ffill_query_in_range(expr, lower, upper, @@ -1273,4 +1303,4 @@ def 
ffill_query_in_range(expr, **odo_kwargs ) raw.loc[:, ts_field] = raw.loc[:, ts_field].astype('datetime64[ns]') - return raw + return raw \ No newline at end of file diff --git a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py index 6a89fa7b..c2341164 100644 --- a/zipline/pipeline/loaders/blaze/estimates.py +++ b/zipline/pipeline/loaders/blaze/estimates.py @@ -2,24 +2,17 @@ from datashape import istabular from .core import ( bind_expression_to_resources, - ffill_query_in_range, + load_raw_data, ) from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.events import ( - EventsLoader, required_event_fields, ) -from zipline.pipeline.common import ( - SID_FIELD_NAME, - TS_FIELD_NAME, -) from zipline.pipeline.loaders.quarter_estimates import \ NextQuartersEstimatesLoader, PreviousQuartersEstimatesLoader from zipline.pipeline.loaders.utils import ( check_data_query_args, - normalize_data_query_bounds, - normalize_timestamp_to_query_time, - load_raw_data) +) from zipline.utils.input_validation import ensure_timezone, optionally from zipline.utils.preprocess import preprocess diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 4165166b..d5ac8e37 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -2,22 +2,14 @@ from datashape import istabular from .core import ( bind_expression_to_resources, - ffill_query_in_range, + load_raw_data, ) from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.events import ( EventsLoader, required_event_fields, ) -from zipline.pipeline.common import ( - SID_FIELD_NAME, - TS_FIELD_NAME, -) -from zipline.pipeline.loaders.utils import ( - check_data_query_args, - normalize_data_query_bounds, - normalize_timestamp_to_query_time, - load_raw_data) +from zipline.pipeline.loaders.utils import check_data_query_args from zipline.utils.input_validation 
import ensure_timezone, optionally from zipline.utils.preprocess import preprocess diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index 495bc075..6fb2d5d8 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,11 +1,17 @@ -from itertools import groupby -import numpy as np import pandas as pd from six import viewvalues -from zipline.pipeline.common import AD_FIELD_NAME, SID_FIELD_NAME, \ - EVENT_DATE_FIELD_NAME, FISCAL_QUARTER_FIELD_NAME, FISCAL_YEAR_FIELD_NAME +from toolz import groupby +from zipline.pipeline.common import ( + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + SID_FIELD_NAME, + TS_FIELD_NAME, +) from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.frame import DataFrameLoader +from zipline.pipeline.loaders.utils import calc_backward_shift, \ + calc_forward_shift def required_event_fields(columns): @@ -15,7 +21,7 @@ def required_event_fields(columns): """ # These metadata columns are used to align event indexers. 
return { - AD_FIELD_NAME, + TS_FIELD_NAME, SID_FIELD_NAME, EVENT_DATE_FIELD_NAME, FISCAL_QUARTER_FIELD_NAME, @@ -48,23 +54,6 @@ def validate_column_specs(events, columns): ) -def calc_forward_shift(qtr, num_shifts): - yrs_to_shift, new_qtr = divmod(qtr + num_shifts, 4) - if yrs_to_shift == 1 and new_qtr == 0: - yrs_to_shift = 0 - new_qtr = 4 - return yrs_to_shift, new_qtr - - -def calc_backward_shift(qtr, num_shifts): - yrs_to_shift, new_qtr = divmod(abs(num_shifts - qtr), 4) - if yrs_to_shift == 0 and new_qtr == 0: - yrs_to_shift = 1 - new_qtr = 4 - yrs_to_shift = -yrs_to_shift - return yrs_to_shift, new_qtr - - class QuarterEstimatesLoader(PipelineLoader): def __init__(self, events, @@ -75,60 +64,45 @@ class QuarterEstimatesLoader(PipelineLoader): ) self.events = events[ - events[EVENT_DATE_FIELD_NAME].notnull() and - events[FISCAL_QUARTER_FIELD_NAME].notnull() and + events[EVENT_DATE_FIELD_NAME].notnull() & + events[FISCAL_QUARTER_FIELD_NAME].notnull() & events[FISCAL_YEAR_FIELD_NAME].notnull() ] self.columns = columns - def load_quarters(self, next_releases, num_quarters, dates_sids, gb): + def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): pass def load_adjusted_array(self, columns, dates, assets, mask): groups = groupby(lambda x: x.dataset.num_quarters, columns) out = {} - date_values = pd.DataFrame(dates, columns=['dates']) + date_values = pd.DataFrame({'dates': dates}) date_values['key'] = 1 self.events['key'] = 1 merged = pd.merge(date_values, self.events, on='key') - asset_df = pd.DataFrame(assets, columns=['sid']) + asset_df = pd.DataFrame({'sid': assets}) asset_df['key'] = 1 dates_sids = pd.merge(date_values, asset_df, on='key') for num_quarters in groups: columns = groups[num_quarters] # First, group by sid, fiscal year, and fiscal quarter and only # keep the last estimate made. 
- final_releases_per_qtr = merged[merged.asof_date <= + final_releases_per_qtr = merged[merged[TS_FIELD_NAME] <= merged.dates].sort( - ['dates', 'asof_date'] + ['dates', TS_FIELD_NAME] ).groupby( ['dates', 'sid', 'fiscal_year', 'fiscal_quarter'] ).last() - gb = final_releases_per_qtr.reset_index().groupby(['dates', 'sid']) - # Split the date-sid combinations into ones with a next release - # and ones without - eligible_next_releases = pd.concat([group[1] for group in gb if ( - group[1][EVENT_DATE_FIELD_NAME] >= group[1]['dates'] - ).any()]) + final_releases_per_qtr = final_releases_per_qtr.reset_index() - eligible_next_releases.sort(EVENT_DATE_FIELD_NAME) - # For each sid, get the next release/year/quarter that we care - # about. - next_releases = eligible_next_releases.groupby( - ['dates', 'sid'] - ).min() - next_releases = next_releases.rename( - columns={'fiscal_year': 'next_fiscal_year', - 'fiscal_quarter': 'next_fiscal_quarter'} - ) - - result = self.load_quarters(next_releases, - num_quarters, - dates_sids) + result = self.load_quarters(num_quarters, + dates_sids, + final_releases_per_qtr) for c in columns: - column_name = self.columns[c.name] + super_col = getattr(c.dataset.__base__, c.name) + column_name = self.columns[super_col] # Need to pass a DataFrame that has dates as the index and # all sids as columns with column values being the value in # 'result' for column c @@ -147,9 +121,24 @@ class NextQuartersEstimatesLoader(QuarterEstimatesLoader): def __init__(self, events, columns): - super(NextQuartersEstimatesLoader).__init__(events, columns) + super(NextQuartersEstimatesLoader, self).__init__(events, columns) - def load_quarters(self, next_releases, num_quarters, dates_sids, gb): + def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): + # Filter for releases that are after each simulation date. 
+ eligible_next_releases = final_releases_per_qtr[ + final_releases_per_qtr[EVENT_DATE_FIELD_NAME] >= + final_releases_per_qtr['dates'] + ] + + eligible_next_releases.sort(EVENT_DATE_FIELD_NAME) + # For each sid, get the upcoming release/year/quarter. + next_releases = eligible_next_releases.groupby( + ['dates', 'sid'] + ).min() + next_releases = next_releases.rename( + columns={'fiscal_year': 'next_fiscal_year', + 'fiscal_quarter': 'next_fiscal_quarter'} + ) # `next_qtr` is already the next quarter over, # so we should offest `num_shifts` by 1. next_releases['fiscal_quarter'] = next_releases.apply( @@ -175,47 +164,39 @@ class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): def __init__(self, events, columns): - super(PreviousQuartersEstimatesLoader).__init__(events, columns) + super(PreviousQuartersEstimatesLoader, self).__init__(events, columns) - def load_quarters(self, next_releases, num_quarters, dates_sids, gb): - next_releases['fiscal_quarter'] = next_releases.apply( - lambda x: calc_backward_shift(x['next_fiscal_quarter'], - num_quarters)[1], - axis=1 - ) - next_releases['fiscal_year'] = next_releases.apply( - lambda x: - x['next_fiscal_year'] + - calc_backward_shift(x['next_fiscal_quarter'], - num_quarters)[0], - axis=1 - ) - only_previous_releases = pd.concat([group[1] for group in gb if ( - group[1][EVENT_DATE_FIELD_NAME] < group[1]['dates'] - ).all()]) - only_previous_releases.sort(EVENT_DATE_FIELD_NAME) + def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): + # Filter for releases that are before each simulation date. + eligible_previous_releases = final_releases_per_qtr[ + final_releases_per_qtr[EVENT_DATE_FIELD_NAME] <= + final_releases_per_qtr['dates'] + ] + + eligible_previous_releases.sort(EVENT_DATE_FIELD_NAME) # For each sid, get the latest release we knew about prior to # each simulation date. 
- previous_releases = only_previous_releases.groupby(['dates', - 'sid']).max() + previous_releases = eligible_previous_releases.groupby( + ['dates', 'sid'] + ).max() + previous_releases = previous_releases.rename(columns={ 'fiscal_year': 'previous_fiscal_year', 'fiscal_quarter': 'previous_fiscal_quarter' }) previous_releases['fiscal_quarter'] = previous_releases.apply( lambda x: calc_backward_shift(x['previous_fiscal_quarter'], - num_quarters)[1], + (num_quarters - 1))[1], axis=1 ) previous_releases['fiscal_year'] = previous_releases.apply( lambda x: - x['previous_fiscal_year'] + + x['previous_fiscal_year'] - calc_backward_shift(x['previous_fiscal_quarter'], - num_quarters)[0], + (num_quarters - 1))[0], axis=1 ) - all_releases = pd.concat([next_releases, previous_releases]) # Merge to get the rows we care about for each date - result = dates_sids.merge(all_releases.reset_index(), + result = dates_sids.merge(previous_releases.reset_index(), on=(['dates', 'sid']), how='left') return result diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 2f388810..4e999b07 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -2,8 +2,6 @@ import datetime import numpy as np import pandas as pd -from zipline.pipeline.common import TS_FIELD_NAME, SID_FIELD_NAME -from zipline.pipeline.loaders.blaze.core import ffill_query_in_range from zipline.utils.pandas_utils import mask_between_time @@ -276,31 +274,59 @@ def check_data_query_args(data_query_time, data_query_tz): ) -def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, - odo_kwargs): - lower_dt, upper_dt = normalize_data_query_bounds( - dates[0], - dates[-1], - data_query_time, - data_query_tz, - ) - raw = ffill_query_in_range( - expr, - lower_dt, - upper_dt, - odo_kwargs, - ) - sids = raw.loc[:, SID_FIELD_NAME] - raw.drop( - sids[~sids.isin(assets)].index, - inplace=True - ) - if data_query_time is not None: - 
normalize_timestamp_to_query_time( - raw, - data_query_time, - data_query_tz, - inplace=True, - ts_field=TS_FIELD_NAME, - ) - return raw +def calc_forward_shift(qtr, num_qtrs_shift): + """ + Calculate the number of years to shift forward and the new quarter in the + shifted year. + + Parameters + ---------- + qtr : int + The starting quarter. + num_qtr_shift : int + The number of quarters to shift forward. + + Returns + ------- + yrs_to_shift : int + The number of years to shift forward. + new_qtr : int + The quarter number of the new quarter after shifting num_qtrs_shift + forward from qtr. + """ + yrs_to_shift, new_qtr = divmod(qtr + num_qtrs_shift, 4) + if new_qtr == 0: + yrs_to_shift -= 1 + new_qtr = 4 + return yrs_to_shift, new_qtr + + +def calc_backward_shift(qtr, num_qtrs_shift): + """ + Calculate the number of years to shift backward and the new quarter in the + shifted year. + + Parameters + ---------- + qtr : int + The starting quarter. + num_qtr_shift : int + The number of quarters to shift backward. + + Returns + ------- + yrs_to_shift : int + The number of years to shift backward. + new_qtr : int + The quarter number of the new quarter after shifting num_qtrs_shift + backward from qtr. + """ + if qtr > num_qtrs_shift: + return 0, qtr - num_qtrs_shift + # num_qtrs_shift >= qtr; subtract to offset qtr, then calculate how many + # years/quarters to subtract. 
+ yrs_to_shift, subtract_qtr = divmod(abs(num_qtrs_shift - qtr), 4) + # Must add 1 year since we go backwards at least `qtr` number of quarters + yrs_to_shift += 1 + new_qtr = 4 - subtract_qtr + return yrs_to_shift, new_qtr \ No newline at end of file From 863da5932c8e455c1caf7f277e9ee22e7c07491b Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 9 Aug 2016 16:24:03 -0400 Subject: [PATCH 03/16] TST: add tests for quarter rotation logic --- tests/pipeline/test_quarters_estimates.py | 281 +++++++++++------- zipline/pipeline/loaders/events.py | 23 +- zipline/pipeline/loaders/quarter_estimates.py | 186 ++++++++---- zipline/pipeline/loaders/utils.py | 68 +---- 4 files changed, 321 insertions(+), 237 deletions(-) diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 02666964..2328f965 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -1,15 +1,24 @@ -from itertools import product +import itertools import numpy as np import pandas as pd +from pandas.util.testing import assert_series_equal from zipline.pipeline import SimplePipelineEngine, Pipeline from zipline.pipeline.data import DataSet, Column -from zipline.pipeline.loaders.quarter_estimates import \ - NextQuartersEstimatesLoader, PreviousQuartersEstimatesLoader +from zipline.pipeline.loaders.quarter_estimates import ( + NextQuartersEstimatesLoader, + PreviousQuartersEstimatesLoader +) +from zipline.pipeline.loaders.quarter_estimates import ( + calc_forward_shift, + calc_backward_shift +) from zipline.testing import ZiplineTestCase from zipline.testing.fixtures import WithAssetFinder, WithTradingSessions from zipline.testing.predicates import assert_equal from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype +import line_profiler +prof = line_profiler.LineProfiler() class Estimates(DataSet): @@ -23,70 +32,106 @@ class Estimates(DataSet): def QuartersEstimates(num_qtr): class 
QtrEstimates(Estimates): num_quarters = num_qtr - name=Estimates + name = Estimates return QtrEstimates -# Final release dates never change +# Final release dates never change. The quarters have very tight date ranges +# in order to reduce the number of dates we need to iterate through when +# testing. releases = pd.DataFrame({ - 'sid': [1, 1], - 'timestamp': [pd.Timestamp('2015-01-20'), pd.Timestamp('2015-4-20')], - 'event_date': [pd.Timestamp('2015-01-20'), pd.Timestamp('2015-04-20')], + 'timestamp': [pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-31')], + 'event_date': [pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-31')], 'estimate': [0.5, 0.8], 'value': [0.6, 0.9], - 'fiscal_quarter': [1, 2], - 'fiscal_year': [2015, 2015] + 'fiscal_quarter': [1.0, 2.0], + 'fiscal_year': [2015.0, 2015.0] }) +q1_knowledge_dates = [pd.Timestamp('2015-01-01'), pd.Timestamp('2015-01-04'), + pd.Timestamp('2015-01-08'), pd.Timestamp('2015-01-12')] +q2_knowledge_dates = [pd.Timestamp('2015-01-16'), pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-01-24'), pd.Timestamp('2015-01-28')] +# We want to model the possibility of an estimate predicting a release date +# that gets shifted forward/backward. 
+q1_release_dates = [pd.Timestamp('2015-01-13'), pd.Timestamp('2015-01-15')] +q2_release_dates = [pd.Timestamp('2015-01-28'), pd.Timestamp('2015-01-30')] estimates = pd.DataFrame({ - 'sid': [1, 1, 1, 1], - 'timestamp': [pd.Timestamp('2015-01-02'), - pd.Timestamp('2015-01-10'), - pd.Timestamp('2015-04-02'), - pd.Timestamp('2015-4-10')], - 'event_date': [pd.Timestamp('2015-01-20'), - pd.Timestamp('2015-01-20'), - pd.Timestamp('2015-04-20'), - pd.Timestamp('2015-04-20')], 'estimate': [.1, .2, .3, .4], 'value': [np.NaN, np.NaN, np.NaN, np.NaN], - 'fiscal_quarter': [1, 1, 2, 2], - 'fiscal_year': [2015, 2015, 2015, 2015] + 'fiscal_quarter': [1.0, 1.0, 2.0, 2.0], + 'fiscal_year': [2015.0, 2015.0, 2015.0, 2015.0] }) -events = pd.concat([releases, estimates]) + +def gen_estimates(): + sid_estimates = [] + sid_releases = [] + release_dates = list(itertools.product(q1_release_dates, q2_release_dates)) + knowledge_permutations = list(itertools.permutations(q1_knowledge_dates + + q2_knowledge_dates, + 4)) + all_permutations = itertools.product(knowledge_permutations, + release_dates) + for sid, ((q1e1, q1e2, q2e1, q2e2), (rd1, rd2)) in enumerate( + all_permutations): + # We're assuming that estimates must come before the relevant release. 
+ if q1e1 < q1e2 and q2e1 < q2e2 and q1e1 < rd1 and q1e2 < \ + rd2: + sid_estimate = estimates.copy(True) + sid_estimate['timestamp'] = [q1e1, q1e2, q2e1, q2e2] + sid_estimate['event_date'] = [rd1]*2 + [rd2] * 2 + sid_estimate['sid'] = sid + sid_estimates += [sid_estimate] + sid_release = releases.copy(True) + sid_release['sid'] = sid_estimate['sid'] + sid_releases += [sid_release] + + return pd.concat(sid_estimates + sid_releases).reset_index(drop=True) -class NextEstimateTestCase(WithAssetFinder, - WithTradingSessions, - ZiplineTestCase): - START_DATE = pd.Timestamp('2015-01-01') - END_DATE = pd.Timestamp('2015-04-30') +class EstimateTestCase(WithAssetFinder, + WithTradingSessions, + ZiplineTestCase): + START_DATE = pd.Timestamp('2014-12-28') + END_DATE = pd.Timestamp('2015-02-03') + @classmethod + def make_loader(cls, events, columns): + pass + + @classmethod + def init_class_fixtures(cls): + cls.events = gen_estimates() + cls.sids = cls.events['sid'].unique() + cls.columns = { + Estimates.estimate: 'estimate', + Estimates.event_date: 'event_date', + Estimates.fiscal_quarter: 'fiscal_quarter', + Estimates.fiscal_year: 'fiscal_year', + Estimates.value: 'value', + } + cls.loader = cls.make_loader( + events=cls.events, + columns=cls.columns + ) + cls.ASSET_FINDER_EQUITY_SIDS = list(cls.events['sid'].unique()) + cls.ASSET_FINDER_EQUITY_SYMBOLS = [ + 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS + ] + super(EstimateTestCase, cls).init_class_fixtures() + + +class NextEstimateTestCase(EstimateTestCase): @classmethod def make_loader(cls, events, columns): return NextQuartersEstimatesLoader(events, columns) - @classmethod - def init_class_fixtures(cls): - cls.events = events - cls.columns = { - Estimates.estimate: 'estimate', - Estimates.event_date: 'event_date', - Estimates.fiscal_quarter: 'fiscal_quarter', - Estimates.fiscal_year: 'fiscal_year', - Estimates.value: 'value', - } - cls.loader = cls.make_loader( - events=cls.events, - columns=cls.columns - ) - 
cls.ASSET_FINDER_EQUITY_SIDS = list(cls.events['sid'].unique()) - cls.ASSET_FINDER_EQUITY_SYMBOLS = [ - 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS - ] - super(NextEstimateTestCase, cls).init_class_fixtures() - - def test_regular(self): + #@profile + def test_next_estimates(self): + """ + The goal of this test is to make sure that we select the right + datapoint as our 'next' w.r.t each date. + """ dataset = QuartersEstimates(1) engine = SimplePipelineEngine( lambda x: self.loader, @@ -99,55 +144,43 @@ class NextEstimateTestCase(WithAssetFinder, start_date=self.trading_days[0], end_date=self.trading_days[-1], ) - sid_events = results.xs(1, level=1) - ed_sorted_events = self.events.sort(['event_date', 'timestamp']) - for i, date in enumerate(sid_events.index): - # Get all upcoming events that we know about on 'date' - eligible_timestamps = ed_sorted_events[ed_sorted_events['timestamp'] - <= date] - eligible_events = eligible_timestamps[eligible_timestamps['event_date'] >= date] - if not eligible_events.empty: - smallest_event_date = eligible_events.iloc[0]['event_date'] - expected_event = eligible_events[eligible_events['event_date'] == smallest_event_date].iloc[-1] - for colname in sid_events.columns: - expected_value = expected_event[colname] - computed_value = sid_events.iloc[i][colname] - assert_equal(expected_value, computed_value) - else: - assert sid_events.iloc[i].isnull().all() + for sid in self.sids: + sid_events = results.xs(sid, level=1) + ed_sorted_events = self.events[ + self.events['sid'] == sid + ] + ed_sorted_events['key'] = 1 + all_dates = pd.DataFrame({'all_dates': sid_events.index}) + all_dates['key'] = 1 + crossproduct = pd.merge(all_dates, ed_sorted_events, on='key') + crossproduct = crossproduct[crossproduct['timestamp'] <= + crossproduct['all_dates']] + crossproduct = crossproduct[crossproduct['event_date'] >= + crossproduct['all_dates']] + final = crossproduct.sort_values(by=['all_dates', + 'event_date', + 'timestamp'], + 
ascending=[True, True, + False]).groupby([ + 'all_dates', 'sid']).first().reset_index() + final = pd.merge(final, all_dates, + how='right').sort_values(by='all_dates').set_index( + 'all_dates') + final.index.name = None + for colname in sid_events.columns: + assert_series_equal(final[colname], sid_events[colname]) -class PreviousEstimateTestCase(WithAssetFinder, - WithTradingSessions, - ZiplineTestCase): - START_DATE = pd.Timestamp('2015-01-01') - END_DATE = pd.Timestamp('2015-04-30') - +class PreviousEstimateTestCase(EstimateTestCase): @classmethod def make_loader(cls, events, columns): return PreviousQuartersEstimatesLoader(events, columns) - @classmethod - def init_class_fixtures(cls): - cls.events = events - cls.columns = { - Estimates.estimate: 'estimate', - Estimates.event_date: 'event_date', - Estimates.fiscal_quarter: 'fiscal_quarter', - Estimates.fiscal_year: 'fiscal_year', - Estimates.value: 'value', - } - cls.loader = cls.make_loader( - events=cls.events, - columns=cls.columns - ) - cls.ASSET_FINDER_EQUITY_SIDS = list(cls.events['sid'].unique()) - cls.ASSET_FINDER_EQUITY_SYMBOLS = [ - 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS - ] - super(PreviousEstimateTestCase, cls).init_class_fixtures() - - def test_regular(self): + def test_previous_estimates(self): + """ + The goal of this test is to make sure that we select the right + datapoint as our 'previous' w.r.t each date. + """ dataset = QuartersEstimates(1) engine = SimplePipelineEngine( lambda x: self.loader, @@ -160,19 +193,53 @@ class PreviousEstimateTestCase(WithAssetFinder, start_date=self.trading_days[0], end_date=self.trading_days[-1], ) - sid_events = results.xs(1, level=1) - ed_sorted_events = self.events.sort(['event_date', 'timestamp']) - for i, date in enumerate(sid_events.index): - # Filter for events that happened on or before the simulation - # date and that we knew about on or before the simulation date. 
- ed_eligible_events = ed_sorted_events[ed_sorted_events['event_date'] <= date] - ts_eligible_events = ed_eligible_events[ed_eligible_events['timestamp'] <= date] - if not ts_eligible_events.empty: - # The expected event is the one we knew about last. - expected_event = ts_eligible_events.iloc[-1] - for colname in sid_events.columns: - expected_value = expected_event[colname] - computed_value = sid_events.iloc[i][colname] - assert_equal(expected_value, computed_value) - else: - assert sid_events.iloc[i].isnull().all() + for sid in self.sids: + sid_events = results.xs(sid, level=1) + ed_sorted_events = self.events[ + self.events['sid'] == sid + ].sort_values(by=['event_date', 'timestamp']) + for i, date in enumerate(sid_events.index): + # Filter for events that happened on or before the simulation + # date and that we knew about on or before the simulation date. + ed_eligible_events = ed_sorted_events[ed_sorted_events['event_date'] <= date] + ts_eligible_events = ed_eligible_events[ed_eligible_events['timestamp'] <= date] + if not ts_eligible_events.empty: + # The expected event is the one we knew about last. + expected_event = ts_eligible_events.iloc[-1] + for colname in sid_events.columns: + expected_value = expected_event[colname] + computed_value = sid_events.iloc[i][colname] + assert_equal(expected_value, computed_value) + else: + assert sid_events.iloc[i].isnull().all() + + +class QuarterShiftTestCase(ZiplineTestCase): + """ + This tests, in isolation, quarter calculation logic for shifting quarters + backwards/forwards from a starting point. + """ + def test_calc_forward_shift(self): + input_yrs = pd.Series([0] * 4) + input_qtrs = pd.Series(range(1, 5)) + expected = pd.DataFrame(([yr, qtr] for yr in range(0, 4) for qtr + in range(1, 5))) + for i in range(0, 8): + years, quarters = calc_forward_shift(input_yrs, input_qtrs, i) + # Can't use assert_series_equal here with check_names=False + # because that still fails due to name differences. 
+ assert years.equals(expected[i:i+4].reset_index(drop=True)[0]) + assert quarters.equals(expected[i:i+4].reset_index(drop=True)[1]) + + + def test_calc_backward_shift(self): + input_yrs = pd.Series([0] * 4) + input_qtrs = pd.Series(range(4, 0, -1)) + expected = pd.DataFrame(([yr, qtr] for yr in range(0, -4, -1) for qtr + in range(4, 0, -1))) + for i in range(0, 8): + years, quarters = calc_backward_shift(input_yrs, input_qtrs, i) + # Can't use assert_series_equal here with check_names=False + # because that still fails due to name differences. + assert years.equals(expected[i:i+4].reset_index(drop=True)[0]) + assert quarters.equals(expected[i:i+4].reset_index(drop=True)[1]) diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index 1c49779c..af11499e 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -12,6 +12,7 @@ from zipline.pipeline.common import ( TS_FIELD_NAME, ) from zipline.pipeline.loaders.utils import ( + choose_rows_by_indexer, next_event_indexer, previous_event_indexer, ) @@ -166,7 +167,8 @@ class EventsLoader(PipelineLoader): if not columns: return {} - return self._load_events( + return choose_rows_by_indexer( + rows=self.events, name_map=self.next_value_columns, indexer=self.next_event_indexer(dates, sids), columns=columns, @@ -179,7 +181,8 @@ class EventsLoader(PipelineLoader): if not columns: return {} - return self._load_events( + return choose_rows_by_indexer( + rows=self.events, name_map=self.previous_value_columns, indexer=self.previous_event_indexer(dates, sids), columns=columns, @@ -188,22 +191,6 @@ class EventsLoader(PipelineLoader): mask=mask, ) - def _load_events(self, name_map, indexer, columns, dates, sids, mask): - def to_frame(array): - return pd.DataFrame(array, index=dates, columns=sids) - - out = {} - for c in columns: - raw = self.events[name_map[c]][indexer] - # indexer will be -1 for locations where we don't have a known - # value. 
- raw[indexer < 0] = c.missing_value - - # Delegate the actual array formatting logic to a DataFrameLoader. - loader = DataFrameLoader(c, to_frame(raw), adjustments=None) - out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c] - return out - def load_adjusted_array(self, columns, dates, sids, mask): n, p = self.split_next_and_previous_event_columns(columns) return merge( diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index 6fb2d5d8..fa480b4d 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd from six import viewvalues from toolz import groupby @@ -10,8 +11,84 @@ from zipline.pipeline.common import ( ) from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.frame import DataFrameLoader -from zipline.pipeline.loaders.utils import calc_backward_shift, \ - calc_forward_shift + +import line_profiler +from zipline.pipeline.loaders.utils import choose_rows_by_indexer + +PREVIOUS_FISCAL_QUARTER = 'previous_fiscal_quarter' + +PREVIOUS_FISCAL_YEAR = 'previous_fiscal_year' + +NEXT_FISCAL_QUARTER = 'next_fiscal_quarter' + +NEXT_FISCAL_YEAR = 'next_fiscal_year' + +FISCAL_QUARTER = 'fiscal_quarter' + +FISCAL_YEAR = 'fiscal_year' + +ALL_DATES = 'dates' + +prof = line_profiler.LineProfiler() + + +#@profile +def calc_forward_shift(yrs, qtrs, num_qtrs_shift): + """ + Calculate the number of years to shift forward and the new quarter in the + shifted year. + + Parameters + ---------- + qtr : int + The starting quarter. + num_qtr_shift : int + The number of quarters to shift forward. + yr : int + The starting year. + + Returns + ------- + s : pd.Series + A series containins the new year and quarter. 
+ """ + + result_qtrs = (qtrs + num_qtrs_shift) % 4 + result_years = yrs + (qtrs + num_qtrs_shift) // 4 + to_adjust = result_qtrs[result_qtrs == 0].index + result_years.iloc[to_adjust] -= 1 + result_qtrs.iloc[to_adjust] = 4 + return result_years, result_qtrs + + +#@profile +def calc_backward_shift(yrs, qtrs, num_qtrs_shift): + """ + Calculate the number of years to shift backward and the new quarter in the + shifted year. + + Parameters + ---------- + qtr : int + The starting quarter. + num_qtr_shift : int + The number of quarters to shift backward. + yr : int + The starting year. + + Returns + ------- + s : pd.Series + A series containins the new year and quarter. + """ + result_qtrs = 4 - (num_qtrs_shift - qtrs) % 4 + # Must subtract 1 year since we go backwards at least `qtr` number of + # quarters + result_years = yrs - (num_qtrs_shift - qtrs) // 4 - 1 + no_yr_boundary_crossed = qtrs[qtrs > num_qtrs_shift].index + result_years.iloc[no_yr_boundary_crossed] = yrs.iloc[no_yr_boundary_crossed] + result_qtrs.iloc[no_yr_boundary_crossed] = qtrs.iloc[no_yr_boundary_crossed] - num_qtrs_shift + return result_years, result_qtrs def required_event_fields(columns): @@ -56,35 +133,40 @@ def validate_column_specs(events, columns): class QuarterEstimatesLoader(PipelineLoader): def __init__(self, - events, - columns): + estimates, + base_column_name_map): validate_column_specs( - events, - columns + estimates, + base_column_name_map ) - self.events = events[ - events[EVENT_DATE_FIELD_NAME].notnull() & - events[FISCAL_QUARTER_FIELD_NAME].notnull() & - events[FISCAL_YEAR_FIELD_NAME].notnull() + self.estimates = estimates[ + estimates[EVENT_DATE_FIELD_NAME].notnull() & + estimates[FISCAL_QUARTER_FIELD_NAME].notnull() & + estimates[FISCAL_YEAR_FIELD_NAME].notnull() ] - self.columns = columns + self.base_column_name_map = base_column_name_map def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): pass + #@profile def load_adjusted_array(self, columns, dates, 
assets, mask): groups = groupby(lambda x: x.dataset.num_quarters, columns) out = {} date_values = pd.DataFrame({'dates': dates}) date_values['key'] = 1 - self.events['key'] = 1 - merged = pd.merge(date_values, self.events, on='key') - asset_df = pd.DataFrame({'sid': assets}) + self.estimates['key'] = 1 + merged = pd.merge(date_values, self.estimates, on='key') + asset_df = pd.DataFrame({SID_FIELD_NAME: assets}) asset_df['key'] = 1 dates_sids = pd.merge(date_values, asset_df, on='key') + merged.drop('key', axis=1, inplace=True) + dates_sids.drop('key', axis=1, inplace=True) for num_quarters in groups: + name_map = {c: self.base_column_name_map[getattr(c.dataset.__base__, c.name)] for c in columns} + columns = groups[num_quarters] # First, group by sid, fiscal year, and fiscal quarter and only # keep the last estimate made. @@ -92,7 +174,7 @@ class QuarterEstimatesLoader(PipelineLoader): merged.dates].sort( ['dates', TS_FIELD_NAME] ).groupby( - ['dates', 'sid', 'fiscal_year', 'fiscal_quarter'] + ['dates', SID_FIELD_NAME, FISCAL_YEAR, FISCAL_QUARTER] ).last() final_releases_per_qtr = final_releases_per_qtr.reset_index() @@ -101,15 +183,14 @@ class QuarterEstimatesLoader(PipelineLoader): final_releases_per_qtr) for c in columns: - super_col = getattr(c.dataset.__base__, c.name) - column_name = self.columns[super_col] + column_name = name_map[c] # Need to pass a DataFrame that has dates as the index and # all sids as columns with column values being the value in # 'result' for column c loader = DataFrameLoader( c, result.pivot(index='dates', - columns='sid', + columns=SID_FIELD_NAME, values=column_name), adjustments=None ) @@ -118,11 +199,8 @@ class QuarterEstimatesLoader(PipelineLoader): class NextQuartersEstimatesLoader(QuarterEstimatesLoader): - def __init__(self, - events, - columns): - super(NextQuartersEstimatesLoader, self).__init__(events, columns) + #@profile def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): # Filter for releases that 
are after each simulation date. eligible_next_releases = final_releases_per_qtr[ @@ -133,39 +211,34 @@ class NextQuartersEstimatesLoader(QuarterEstimatesLoader): eligible_next_releases.sort(EVENT_DATE_FIELD_NAME) # For each sid, get the upcoming release/year/quarter. next_releases = eligible_next_releases.groupby( - ['dates', 'sid'] - ).min() + ['dates', SID_FIELD_NAME] + ).nth(0).reset_index() # We use nth here to avoid forward filling + # NaNs, which `first()` will do. next_releases = next_releases.rename( - columns={'fiscal_year': 'next_fiscal_year', - 'fiscal_quarter': 'next_fiscal_quarter'} + columns={FISCAL_YEAR: NEXT_FISCAL_YEAR, + FISCAL_QUARTER: NEXT_FISCAL_QUARTER} ) # `next_qtr` is already the next quarter over, # so we should offest `num_shifts` by 1. - next_releases['fiscal_quarter'] = next_releases.apply( - lambda x: calc_forward_shift(x['next_fiscal_quarter'], - num_quarters - 1)[1], - axis=1 - ) - next_releases['fiscal_year'] = next_releases.apply( - lambda x: - x['next_fiscal_year'] + - calc_forward_shift(x['next_fiscal_quarter'], - num_quarters - 1)[0], - axis=1 + (next_releases[FISCAL_YEAR], + next_releases[FISCAL_QUARTER]) = calc_forward_shift( + next_releases[NEXT_FISCAL_YEAR], + next_releases[NEXT_FISCAL_QUARTER], (num_quarters - 1) ) # Merge to get the rows we care about for each date - result = dates_sids.merge(next_releases.reset_index(), - on=(['dates', 'sid']), + result = dates_sids.merge(next_releases, + on=(['dates', SID_FIELD_NAME]), how='left') return result class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): def __init__(self, - events, + estimates, columns): - super(PreviousQuartersEstimatesLoader, self).__init__(events, columns) + super(PreviousQuartersEstimatesLoader, self).__init__(estimates, columns) + #@profile def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): # Filter for releases that are before each simulation date. 
eligible_previous_releases = final_releases_per_qtr[ @@ -177,26 +250,23 @@ class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): # For each sid, get the latest release we knew about prior to # each simulation date. previous_releases = eligible_previous_releases.groupby( - ['dates', 'sid'] - ).max() + ['dates', SID_FIELD_NAME] + ).nth(-1).reset_index() # We use nth here to avoid forward filling + # NaNs, which `last()` will do. previous_releases = previous_releases.rename(columns={ - 'fiscal_year': 'previous_fiscal_year', - 'fiscal_quarter': 'previous_fiscal_quarter' + FISCAL_YEAR: PREVIOUS_FISCAL_YEAR, + FISCAL_QUARTER: PREVIOUS_FISCAL_QUARTER }) - previous_releases['fiscal_quarter'] = previous_releases.apply( - lambda x: calc_backward_shift(x['previous_fiscal_quarter'], - (num_quarters - 1))[1], - axis=1 - ) - previous_releases['fiscal_year'] = previous_releases.apply( - lambda x: - x['previous_fiscal_year'] - - calc_backward_shift(x['previous_fiscal_quarter'], - (num_quarters - 1))[0], - axis=1 + + (previous_releases[FISCAL_YEAR], + previous_releases[FISCAL_QUARTER]) = \ + calc_backward_shift( + previous_releases[PREVIOUS_FISCAL_YEAR], previous_releases[ + PREVIOUS_FISCAL_QUARTER], (num_quarters - 1) ) # Merge to get the rows we care about for each date - result = dates_sids.merge(previous_releases.reset_index(), - on=(['dates', 'sid']), how='left') + result = dates_sids.merge(previous_releases, + on=(['dates', SID_FIELD_NAME]), how='left') return result + diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 4e999b07..385022a8 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -2,6 +2,7 @@ import datetime import numpy as np import pandas as pd +from zipline.pipeline.loaders.frame import DataFrameLoader from zipline.utils.pandas_utils import mask_between_time @@ -274,59 +275,18 @@ def check_data_query_args(data_query_time, data_query_tz): ) -def calc_forward_shift(qtr, 
num_qtrs_shift): - """ - Calculate the number of years to shift forward and the new quarter in the - shifted year. +def choose_rows_by_indexer(rows, name_map, indexer, columns, dates, sids, mask): + def to_frame(array): + return pd.DataFrame(array, index=dates, columns=sids) - Parameters - ---------- - qtr : int - The starting quarter. - num_qtr_shift : int - The number of quarters to shift forward. + out = {} + for c in columns: + raw = rows[name_map[c]][indexer] + # indexer will be -1 for locations where we don't have a known + # value. + raw[indexer < 0] = c.missing_value - Returns - ------- - yrs_to_shift : int - The number of years to shift forward. - new_qtr : int - The quarter number of the new quarter after shifting num_qtrs_shift - forward from qtr. - """ - yrs_to_shift, new_qtr = divmod(qtr + num_qtrs_shift, 4) - if new_qtr == 0: - yrs_to_shift -= 1 - new_qtr = 4 - return yrs_to_shift, new_qtr - - -def calc_backward_shift(qtr, num_qtrs_shift): - """ - Calculate the number of years to shift backward and the new quarter in the - shifted year. - - Parameters - ---------- - qtr : int - The starting quarter. - num_qtr_shift : int - The number of quarters to shift backward. - - Returns - ------- - yrs_to_shift : int - The number of years to shift backward. - new_qtr : int - The quarter number of the new quarter after shifting num_qtrs_shift - backward from qtr. - """ - if qtr > num_qtrs_shift: - return 0, qtr - num_qtrs_shift - # num_qtrs_shift >= qtr; subtract to offset qtr, then calculate how many - # years/quarters to subtract. - yrs_to_shift, subtract_qtr = divmod(abs(num_qtrs_shift - qtr), 4) - # Must add 1 year since we go backwards at least `qtr` number of quarters - yrs_to_shift += 1 - new_qtr = 4 - subtract_qtr - return yrs_to_shift, new_qtr \ No newline at end of file + # Delegate the actual array formatting logic to a DataFrameLoader. 
+ loader = DataFrameLoader(c, to_frame(raw), adjustments=None) + out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c] + return out \ No newline at end of file From d578463dcf698ce6f3481a39629a004f2ed1d701 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 16 Aug 2016 08:28:50 -0400 Subject: [PATCH 04/16] TST: add cases for shifting release dates BUG: fix bugs in blaze loader BUG: call correct method MAINT: explicitly cast dates column MAINT: modify code to comply with pandas 0.16.1 --- tests/pipeline/test_quarters_estimates.py | 257 ++++++++++++------ zipline/pipeline/loaders/blaze/core.py | 32 +-- zipline/pipeline/loaders/blaze/estimates.py | 62 ++--- zipline/pipeline/loaders/blaze/events.py | 8 +- zipline/pipeline/loaders/blaze/utils.py | 61 +++++ zipline/pipeline/loaders/events.py | 23 +- zipline/pipeline/loaders/quarter_estimates.py | 234 +++++++--------- zipline/pipeline/loaders/utils.py | 18 -- 8 files changed, 372 insertions(+), 323 deletions(-) create mode 100644 zipline/pipeline/loaders/blaze/utils.py diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 2328f965..f343bfb8 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -1,24 +1,30 @@ +import blaze as bz import itertools import numpy as np import pandas as pd -from pandas.util.testing import assert_series_equal -from zipline.pipeline import SimplePipelineEngine, Pipeline +from zipline.pipeline import SimplePipelineEngine, Pipeline +from zipline.pipeline.common import ( + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + SID_FIELD_NAME, + TS_FIELD_NAME, +) from zipline.pipeline.data import DataSet, Column +from zipline.pipeline.loaders.blaze.estimates import ( + BlazeNextEstimatesLoader, + BlazePreviousEstimatesLoader +) from zipline.pipeline.loaders.quarter_estimates import ( NextQuartersEstimatesLoader, PreviousQuartersEstimatesLoader ) -from 
zipline.pipeline.loaders.quarter_estimates import ( - calc_forward_shift, - calc_backward_shift -) +from zipline.pipeline.loaders.quarter_estimates import shift_quarters from zipline.testing import ZiplineTestCase from zipline.testing.fixtures import WithAssetFinder, WithTradingSessions from zipline.testing.predicates import assert_equal from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype -import line_profiler -prof = line_profiler.LineProfiler() class Estimates(DataSet): @@ -39,12 +45,13 @@ def QuartersEstimates(num_qtr): # in order to reduce the number of dates we need to iterate through when # testing. releases = pd.DataFrame({ - 'timestamp': [pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-31')], - 'event_date': [pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-31')], + TS_FIELD_NAME: [pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-31')], + EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-15'), + pd.Timestamp('2015-01-31')], 'estimate': [0.5, 0.8], 'value': [0.6, 0.9], - 'fiscal_quarter': [1.0, 2.0], - 'fiscal_year': [2015.0, 2015.0] + FISCAL_QUARTER_FIELD_NAME: [1.0, 2.0], + FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0] }) q1_knowledge_dates = [pd.Timestamp('2015-01-01'), pd.Timestamp('2015-01-04'), @@ -52,40 +59,43 @@ q1_knowledge_dates = [pd.Timestamp('2015-01-01'), pd.Timestamp('2015-01-04'), q2_knowledge_dates = [pd.Timestamp('2015-01-16'), pd.Timestamp('2015-01-20'), pd.Timestamp('2015-01-24'), pd.Timestamp('2015-01-28')] # We want to model the possibility of an estimate predicting a release date -# that gets shifted forward/backward. -q1_release_dates = [pd.Timestamp('2015-01-13'), pd.Timestamp('2015-01-15')] -q2_release_dates = [pd.Timestamp('2015-01-28'), pd.Timestamp('2015-01-30')] +# that doesn't match the actual release. This could be done by dynamically +# generating more combinations with different release dates, but that +# significantly increases the amount of time it takes to run the tests. 
These +# hard-coded cases are sufficient to know that we can update our beliefs when +# we get new information. +q1_release_dates = [pd.Timestamp('2015-01-15'), + pd.Timestamp('2015-01-16')] # One day late +q2_release_dates = [pd.Timestamp('2015-01-30'), # One day early + pd.Timestamp('2015-01-31')] estimates = pd.DataFrame({ + EVENT_DATE_FIELD_NAME: q1_release_dates + q2_release_dates, 'estimate': [.1, .2, .3, .4], 'value': [np.NaN, np.NaN, np.NaN, np.NaN], - 'fiscal_quarter': [1.0, 1.0, 2.0, 2.0], - 'fiscal_year': [2015.0, 2015.0, 2015.0, 2015.0] + FISCAL_QUARTER_FIELD_NAME: [1.0, 1.0, 2.0, 2.0], + FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0, 2015.0, 2015.0] }) def gen_estimates(): sid_estimates = [] sid_releases = [] - release_dates = list(itertools.product(q1_release_dates, q2_release_dates)) - knowledge_permutations = list(itertools.permutations(q1_knowledge_dates + - q2_knowledge_dates, - 4)) - all_permutations = itertools.product(knowledge_permutations, - release_dates) - for sid, ((q1e1, q1e2, q2e1, q2e2), (rd1, rd2)) in enumerate( - all_permutations): + for sid, (q1e1, q1e2, q2e1, q2e2) in enumerate( + itertools.permutations(q1_knowledge_dates + q2_knowledge_dates, + 4) + ): # We're assuming that estimates must come before the relevant release. 
- if q1e1 < q1e2 and q2e1 < q2e2 and q1e1 < rd1 and q1e2 < \ - rd2: + if (q1e1 < q1e2 and + q2e1 < q2e2 and + q1e1 < q1_release_dates[0] and + q1e2 < q1_release_dates[1]): sid_estimate = estimates.copy(True) - sid_estimate['timestamp'] = [q1e1, q1e2, q2e1, q2e2] - sid_estimate['event_date'] = [rd1]*2 + [rd2] * 2 - sid_estimate['sid'] = sid + sid_estimate[TS_FIELD_NAME] = [q1e1, q1e2, q2e1, q2e2] + sid_estimate[SID_FIELD_NAME] = sid sid_estimates += [sid_estimate] sid_release = releases.copy(True) - sid_release['sid'] = sid_estimate['sid'] + sid_release[SID_FIELD_NAME] = sid_estimate[SID_FIELD_NAME] sid_releases += [sid_release] - return pd.concat(sid_estimates + sid_releases).reset_index(drop=True) @@ -105,28 +115,44 @@ class EstimateTestCase(WithAssetFinder, cls.sids = cls.events['sid'].unique() cls.columns = { Estimates.estimate: 'estimate', - Estimates.event_date: 'event_date', - Estimates.fiscal_quarter: 'fiscal_quarter', - Estimates.fiscal_year: 'fiscal_year', + Estimates.event_date: EVENT_DATE_FIELD_NAME, + Estimates.fiscal_quarter: FISCAL_QUARTER_FIELD_NAME, + Estimates.fiscal_year: FISCAL_YEAR_FIELD_NAME, Estimates.value: 'value', } cls.loader = cls.make_loader( events=cls.events, columns=cls.columns ) - cls.ASSET_FINDER_EQUITY_SIDS = list(cls.events['sid'].unique()) + cls.ASSET_FINDER_EQUITY_SIDS = list( + cls.events[SID_FIELD_NAME].unique() + ) cls.ASSET_FINDER_EQUITY_SYMBOLS = [ 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS ] super(EstimateTestCase, cls).init_class_fixtures() + def _test_wrong_num_quarters_passed(self): + with self.assertRaises(ValueError): + dataset = QuartersEstimates(-1) + engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + + engine.run_pipeline( + Pipeline({c.name: c.latest for c in dataset.columns}), + start_date=self.trading_days[0], + end_date=self.trading_days[-1], + ) + class NextEstimateTestCase(EstimateTestCase): @classmethod def make_loader(cls, events, columns): return 
NextQuartersEstimatesLoader(events, columns) - #@profile def test_next_estimates(self): """ The goal of this test is to make sure that we select the right @@ -145,30 +171,62 @@ class NextEstimateTestCase(EstimateTestCase): end_date=self.trading_days[-1], ) for sid in self.sids: - sid_events = results.xs(sid, level=1) - ed_sorted_events = self.events[ - self.events['sid'] == sid - ] - ed_sorted_events['key'] = 1 - all_dates = pd.DataFrame({'all_dates': sid_events.index}) - all_dates['key'] = 1 - crossproduct = pd.merge(all_dates, ed_sorted_events, on='key') - crossproduct = crossproduct[crossproduct['timestamp'] <= - crossproduct['all_dates']] - crossproduct = crossproduct[crossproduct['event_date'] >= - crossproduct['all_dates']] - final = crossproduct.sort_values(by=['all_dates', - 'event_date', - 'timestamp'], - ascending=[True, True, - False]).groupby([ - 'all_dates', 'sid']).first().reset_index() - final = pd.merge(final, all_dates, - how='right').sort_values(by='all_dates').set_index( - 'all_dates') - final.index.name = None - for colname in sid_events.columns: - assert_series_equal(final[colname], sid_events[colname]) + sid_estimates = results.xs(sid, level=1) + ts_sorted_estimates = self.events[ + self.events[SID_FIELD_NAME] == sid + ].sort(TS_FIELD_NAME) + for i, date in enumerate(sid_estimates.index): + comparable_date = date.tz_localize(None) + # Filter out estimates we don't know about yet. + ts_eligible_estimates = ts_sorted_estimates[ + ts_sorted_estimates[TS_FIELD_NAME] <= comparable_date + ] + expected_estimate = pd.DataFrame() + if not ts_eligible_estimates.empty: + q1_knowledge = ts_eligible_estimates[ + ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 1 + ] + q2_knowledge = ts_eligible_estimates[ + ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 2 + ] + + # If our latest knowledge of q1 is that the release is + # happening on this simulation date or later, then that's + # the estimate we want to use. 
+ if (not q1_knowledge.empty and + q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= + comparable_date): + expected_estimate = q1_knowledge.iloc[-1] + # If q1 has already happened or we don't know about it + # yet and our latest knowledge indicates that q2 hasn't + # happend yet, then that's the estimate we want to use. + elif (not q2_knowledge.empty and + q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= + comparable_date): + expected_estimate = q2_knowledge.iloc[-1] + if not expected_estimate.empty: + for colname in sid_estimates.columns: + expected_value = expected_estimate[colname] + computed_value = sid_estimates.iloc[i][colname] + assert_equal(expected_value, computed_value) + else: + assert sid_estimates.iloc[i].isnull().all() + + def test_wrong_num_quarters_passed(self): + self._test_wrong_num_quarters_passed() + + +class BlazeNextEstimateLoaderTestCase(NextEstimateTestCase): + """ + Run the same tests as EventsLoaderTestCase, but using a BlazeEventsLoader. + """ + + @classmethod + def make_loader(cls, events, columns): + return BlazeNextEstimatesLoader( + bz.data(events), + columns, + ) class PreviousEstimateTestCase(EstimateTestCase): @@ -194,24 +252,62 @@ class PreviousEstimateTestCase(EstimateTestCase): end_date=self.trading_days[-1], ) for sid in self.sids: - sid_events = results.xs(sid, level=1) - ed_sorted_events = self.events[ - self.events['sid'] == sid - ].sort_values(by=['event_date', 'timestamp']) - for i, date in enumerate(sid_events.index): - # Filter for events that happened on or before the simulation - # date and that we knew about on or before the simulation date. - ed_eligible_events = ed_sorted_events[ed_sorted_events['event_date'] <= date] - ts_eligible_events = ed_eligible_events[ed_eligible_events['timestamp'] <= date] - if not ts_eligible_events.empty: - # The expected event is the one we knew about last. 
- expected_event = ts_eligible_events.iloc[-1] - for colname in sid_events.columns: - expected_value = expected_event[colname] - computed_value = sid_events.iloc[i][colname] + sid_estimates = results.xs(sid, level=1) + ts_sorted_estimates = self.events[ + self.events[SID_FIELD_NAME] == sid + ].sort(TS_FIELD_NAME) + for i, date in enumerate(sid_estimates.index): + comparable_date = date.tz_localize(None) + # Filter out estimates we don't know about yet. + ts_eligible_estimates = ts_sorted_estimates[ + ts_sorted_estimates[TS_FIELD_NAME] <= comparable_date + ] + expected_estimate = pd.DataFrame() + if not ts_eligible_estimates.empty: + # Determine the last piece of information we know about + # for q1 and q2. This takes advantage of the fact that we + # only have 2 quarters in the test data. + q1_knowledge = ts_eligible_estimates[ + ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 1 + ] + q2_knowledge = ts_eligible_estimates[ + ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 2 + ] + # The expected estimate will be for q2 if the last thing + # we've seen is that the release date already happened. + # Otherwise, it'll be for q1, as long as the release date + # for q1 has already happened. 
+ if (not q2_knowledge.empty and + q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= + comparable_date): + expected_estimate = q2_knowledge.iloc[-1] + elif (not q1_knowledge.empty and + q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= + comparable_date): + expected_estimate = q1_knowledge.iloc[-1] + if not expected_estimate.empty: + for colname in sid_estimates.columns: + expected_value = expected_estimate[colname] + computed_value = sid_estimates.iloc[i][colname] assert_equal(expected_value, computed_value) else: - assert sid_events.iloc[i].isnull().all() + assert sid_estimates.iloc[i].isnull().all() + + def test_wrong_num_quarters_passed(self): + self._test_wrong_num_quarters_passed() + + +class BlazePreviousEstimateLoaderTestCase(PreviousEstimateTestCase): + """ + Run the same tests as EventsLoaderTestCase, but using a BlazeEventsLoader. + """ + + @classmethod + def make_loader(cls, events, columns): + return BlazePreviousEstimatesLoader( + bz.data(events), + columns, + ) class QuarterShiftTestCase(ZiplineTestCase): @@ -225,20 +321,19 @@ class QuarterShiftTestCase(ZiplineTestCase): expected = pd.DataFrame(([yr, qtr] for yr in range(0, 4) for qtr in range(1, 5))) for i in range(0, 8): - years, quarters = calc_forward_shift(input_yrs, input_qtrs, i) + years, quarters = shift_quarters(i, input_yrs, input_qtrs) # Can't use assert_series_equal here with check_names=False # because that still fails due to name differences. 
assert years.equals(expected[i:i+4].reset_index(drop=True)[0]) assert quarters.equals(expected[i:i+4].reset_index(drop=True)[1]) - def test_calc_backward_shift(self): input_yrs = pd.Series([0] * 4) input_qtrs = pd.Series(range(4, 0, -1)) expected = pd.DataFrame(([yr, qtr] for yr in range(0, -4, -1) for qtr in range(4, 0, -1))) - for i in range(0, 8): - years, quarters = calc_backward_shift(input_yrs, input_qtrs, i) + for i in range(0, 8, 1): + years, quarters = shift_quarters(-i, input_yrs, input_qtrs) # Can't use assert_series_equal here with check_names=False # because that still fails due to name differences. assert years.equals(expected[i:i+4].reset_index(drop=True)[0]) diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index fc3252e4..01e1e659 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -1219,36 +1219,6 @@ def bind_expression_to_resources(expr, resources): }) -def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, - odo_kwargs): - lower_dt, upper_dt = normalize_data_query_bounds( - dates[0], - dates[-1], - data_query_time, - data_query_tz, - ) - raw = ffill_query_in_range( - expr, - lower_dt, - upper_dt, - odo_kwargs, - ) - sids = raw.loc[:, SID_FIELD_NAME] - raw.drop( - sids[~sids.isin(assets)].index, - inplace=True - ) - if data_query_time is not None: - normalize_timestamp_to_query_time( - raw, - data_query_time, - data_query_tz, - inplace=True, - ts_field=TS_FIELD_NAME, - ) - return raw - - def ffill_query_in_range(expr, lower, upper, @@ -1303,4 +1273,4 @@ def ffill_query_in_range(expr, **odo_kwargs ) raw.loc[:, ts_field] = raw.loc[:, ts_field].astype('datetime64[ns]') - return raw \ No newline at end of file + return raw diff --git a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py index c2341164..81554b0d 100644 --- a/zipline/pipeline/loaders/blaze/estimates.py +++ 
b/zipline/pipeline/loaders/blaze/estimates.py @@ -2,14 +2,14 @@ from datashape import istabular from .core import ( bind_expression_to_resources, - load_raw_data, ) from zipline.pipeline.loaders.base import PipelineLoader -from zipline.pipeline.loaders.events import ( - required_event_fields, +from zipline.pipeline.loaders.blaze.utils import load_raw_data +from zipline.pipeline.loaders.quarter_estimates import ( + NextQuartersEstimatesLoader, + PreviousQuartersEstimatesLoader, + required_estimates_fields, ) -from zipline.pipeline.loaders.quarter_estimates import \ - NextQuartersEstimatesLoader, PreviousQuartersEstimatesLoader from zipline.pipeline.loaders.utils import ( check_data_query_args, ) @@ -47,7 +47,7 @@ class BlazeEstimatesLoader(PipelineLoader): And other dataset-specific fields, where each row of the table is a record including the sid to identify the company, the timestamp where we - learned about the announcement, and the date when the earnings will be z + learned about the announcement, and the date when the earnings will be announced. 
If the '{TS_FIELD_NAME}' field is not included it is assumed that we @@ -61,8 +61,7 @@ class BlazeEstimatesLoader(PipelineLoader): resources=None, odo_kwargs=None, data_query_time=None, - data_query_tz=None, - loader=None): + data_query_tz=None): dshape = expr.dshape if not istabular(dshape): @@ -71,7 +70,7 @@ class BlazeEstimatesLoader(PipelineLoader): ) required_cols = list( - required_event_fields(columns) + required_estimates_fields(columns) ) self._expr = bind_expression_to_resources( expr[required_cols], @@ -82,15 +81,18 @@ class BlazeEstimatesLoader(PipelineLoader): check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz - self.loader = loader def load_adjusted_array(self, columns, dates, assets, mask): - raw = load_raw_data(assets, dates, self._data_query_time, - self._data_query_tz, self._exp, self._odo_kwargs) + raw = load_raw_data(assets, + dates, + self._data_query_time, + self._data_query_tz, + self._expr, + self._odo_kwargs) return self.loader( - events=raw, - next_value_columns=self._columns, + raw, + self._columns, ).load_adjusted_array( columns, dates, @@ -102,38 +104,6 @@ class BlazeEstimatesLoader(PipelineLoader): class BlazeNextEstimatesLoader(BlazeEstimatesLoader): loader = NextQuartersEstimatesLoader - def __init__(self, - expr, - columns, - resources=None, - odo_kwargs=None, - data_query_time=None, - data_query_tz=None, - loader=None): - super(BlazeNextEstimatesLoader).__init__(expr, - columns, - resources, - odo_kwargs, - data_query_time, - data_query_tz, - loader) - class BlazePreviousEstimatesLoader(BlazeEstimatesLoader): loader = PreviousQuartersEstimatesLoader - - def __init__(self, - expr, - columns, - resources=None, - odo_kwargs=None, - data_query_time=None, - data_query_tz=None, - loader=None): - super(BlazeNextEstimatesLoader).__init__(expr, - columns, - resources, - odo_kwargs, - data_query_time, - data_query_tz, - loader) diff --git 
a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index d5ac8e37..39e5dac6 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -2,14 +2,16 @@ from datashape import istabular from .core import ( bind_expression_to_resources, - load_raw_data, ) from zipline.pipeline.loaders.base import PipelineLoader +from zipline.pipeline.loaders.blaze.utils import load_raw_data from zipline.pipeline.loaders.events import ( EventsLoader, required_event_fields, ) -from zipline.pipeline.loaders.utils import check_data_query_args +from zipline.pipeline.loaders.utils import ( + check_data_query_args, +) from zipline.utils.input_validation import ensure_timezone, optionally from zipline.utils.preprocess import preprocess @@ -29,7 +31,7 @@ class BlazeEventsLoader(PipelineLoader): data_query_time : time, optional The time to use for the data query cutoff. data_query_tz : tzinfo or str - The timezeone to use for the data query cutoff. + The timezone to use for the data query cutoff. dataset : DataSet The DataSet object for which this loader loads data. diff --git a/zipline/pipeline/loaders/blaze/utils.py b/zipline/pipeline/loaders/blaze/utils.py new file mode 100644 index 00000000..6455f76c --- /dev/null +++ b/zipline/pipeline/loaders/blaze/utils.py @@ -0,0 +1,61 @@ +from zipline.pipeline.common import SID_FIELD_NAME, TS_FIELD_NAME +from zipline.pipeline.loaders.blaze.core import ffill_query_in_range +from zipline.pipeline.loaders.utils import ( + normalize_data_query_bounds, + normalize_timestamp_to_query_time, +) + + +def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, + odo_kwargs): + """ + given an expression representing data to load, perform normalization and + forward-filling and return the data, materialized. + + parameters + ---------- + assets : pd.int64index + the assets to load data for. + dates : pd.datetimeindex + the simulation dates to load data for. 
+ data_query_time : datetime.time + the time used as cutoff for new information. + data_query_tz : tzinfo + the timezone to normalize your dates to before comparing against + `time`. + expr : expr + the expression representing the data to load. + odo_kwargs : dict, optional + extra keyword arguments to pass to odo when executing the expression. + + returns + ------- + raw : pd.dataframe + the data symbolized by `expr` materialized in a dataframe. + """ + lower_dt, upper_dt = normalize_data_query_bounds( + dates[0], + dates[-1], + data_query_time, + data_query_tz, + ) + raw = ffill_query_in_range( + expr, + lower_dt, + upper_dt, + odo_kwargs, + ) + sids = raw.loc[:, SID_FIELD_NAME] + raw.drop( + sids[~sids.isin(assets)].index, + inplace=True + ) + if data_query_time is not None: + normalize_timestamp_to_query_time( + raw, + data_query_time, + data_query_tz, + inplace=True, + ts_field=TS_FIELD_NAME, + ) + return raw diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index af11499e..cb33b3b2 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -5,14 +5,13 @@ from six import viewvalues from toolz import groupby, merge from .base import PipelineLoader -from .frame import DataFrameLoader from zipline.pipeline.common import ( EVENT_DATE_FIELD_NAME, SID_FIELD_NAME, TS_FIELD_NAME, ) +from zipline.pipeline.loaders.frame import DataFrameLoader from zipline.pipeline.loaders.utils import ( - choose_rows_by_indexer, next_event_indexer, previous_event_indexer, ) @@ -167,7 +166,7 @@ class EventsLoader(PipelineLoader): if not columns: return {} - return choose_rows_by_indexer( + return self._load_events( rows=self.events, name_map=self.next_value_columns, indexer=self.next_event_indexer(dates, sids), @@ -181,7 +180,7 @@ class EventsLoader(PipelineLoader): if not columns: return {} - return choose_rows_by_indexer( + return self._load_events( rows=self.events, name_map=self.previous_value_columns, 
indexer=self.previous_event_indexer(dates, sids), @@ -191,6 +190,22 @@ class EventsLoader(PipelineLoader): mask=mask, ) + def _load_events(self, name_map, indexer, columns, dates, sids, mask): + def to_frame(array): + return pd.DataFrame(array, index=dates, columns=sids) + + out = {} + for c in columns: + raw = self.events[name_map[c]][indexer] + # indexer will be -1 for locations where we don't have a known + # value. + raw[indexer < 0] = c.missing_value + + # Delegate the actual array formatting logic to a DataFrameLoader. + loader = DataFrameLoader(c, to_frame(raw), adjustments=None) + out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c] + return out + def load_adjusted_array(self, columns, dates, sids, mask): n, p = self.split_next_and_previous_event_columns(columns) return merge( diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index fa480b4d..7837e57d 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,7 +1,8 @@ -import numpy as np +from abc import abstractmethod import pandas as pd from six import viewvalues from toolz import groupby + from zipline.pipeline.common import ( EVENT_DATE_FIELD_NAME, FISCAL_QUARTER_FIELD_NAME, @@ -11,90 +12,33 @@ from zipline.pipeline.common import ( ) from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.frame import DataFrameLoader - -import line_profiler -from zipline.pipeline.loaders.utils import choose_rows_by_indexer - -PREVIOUS_FISCAL_QUARTER = 'previous_fiscal_quarter' - -PREVIOUS_FISCAL_YEAR = 'previous_fiscal_year' +from zipline.utils.pandas_utils import cross_product NEXT_FISCAL_QUARTER = 'next_fiscal_quarter' - NEXT_FISCAL_YEAR = 'next_fiscal_year' - -FISCAL_QUARTER = 'fiscal_quarter' - -FISCAL_YEAR = 'fiscal_year' - -ALL_DATES = 'dates' - -prof = line_profiler.LineProfiler() +PREVIOUS_FISCAL_QUARTER = 'previous_fiscal_quarter' +PREVIOUS_FISCAL_YEAR = 
'previous_fiscal_year' +SIMULTATION_DATES = 'dates' -#@profile -def calc_forward_shift(yrs, qtrs, num_qtrs_shift): - """ - Calculate the number of years to shift forward and the new quarter in the - shifted year. - - Parameters - ---------- - qtr : int - The starting quarter. - num_qtr_shift : int - The number of quarters to shift forward. - yr : int - The starting year. - - Returns - ------- - s : pd.Series - A series containins the new year and quarter. - """ - - result_qtrs = (qtrs + num_qtrs_shift) % 4 - result_years = yrs + (qtrs + num_qtrs_shift) // 4 - to_adjust = result_qtrs[result_qtrs == 0].index - result_years.iloc[to_adjust] -= 1 - result_qtrs.iloc[to_adjust] = 4 - return result_years, result_qtrs +def normalize_quarters(years, quarters): + return years * 4 + quarters - 1 -#@profile -def calc_backward_shift(yrs, qtrs, num_qtrs_shift): - """ - Calculate the number of years to shift backward and the new quarter in the - shifted year. - - Parameters - ---------- - qtr : int - The starting quarter. - num_qtr_shift : int - The number of quarters to shift backward. - yr : int - The starting year. - - Returns - ------- - s : pd.Series - A series containins the new year and quarter. 
- """ - result_qtrs = 4 - (num_qtrs_shift - qtrs) % 4 - # Must subtract 1 year since we go backwards at least `qtr` number of - # quarters - result_years = yrs - (num_qtrs_shift - qtrs) // 4 - 1 - no_yr_boundary_crossed = qtrs[qtrs > num_qtrs_shift].index - result_years.iloc[no_yr_boundary_crossed] = yrs.iloc[no_yr_boundary_crossed] - result_qtrs.iloc[no_yr_boundary_crossed] = qtrs.iloc[no_yr_boundary_crossed] - num_qtrs_shift - return result_years, result_qtrs +def split_normalized_quarters(normalized_quarters): + years = normalized_quarters // 4 + quarters = normalized_quarters % 4 + return years, quarters + 1 -def required_event_fields(columns): +def shift_quarters(by, years, quarters): + return split_normalized_quarters(normalize_quarters(years, quarters) + by) + + +def required_estimates_fields(columns): """ Compute the set of resource columns required to serve - ``next_value_columns`` and ``previous_value_columns``. + `columns`. """ # These metadata columns are used to align event indexers. return { @@ -112,16 +56,16 @@ def required_event_fields(columns): def validate_column_specs(events, columns): """ - Verify that the columns of ``events`` can be used by an EventsLoader to - serve the BoundColumns described by ``next_value_columns`` and - ``previous_value_columns``. + Verify that the columns of ``events`` can be used by a + QuarterEstimatesLoader to serve the BoundColumns described by + `columns`. 
""" - required = required_event_fields(columns) + required = required_estimates_fields(columns) received = set(events.columns) missing = required - received if missing: raise ValueError( - "EventsLoader missing required columns {missing}.\n" + "QuarterEstimatesLoader missing required columns {missing}.\n" "Got Columns: {received}\n" "Expected Columns: {required}".format( missing=sorted(missing), @@ -148,35 +92,45 @@ class QuarterEstimatesLoader(PipelineLoader): self.base_column_name_map = base_column_name_map + @abstractmethod def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): pass - #@profile def load_adjusted_array(self, columns, dates, assets, mask): + # TODO: how can we enforce that datasets have the num_quarters + # attribute, given that they're created dynamically? groups = groupby(lambda x: x.dataset.num_quarters, columns) + groups_columns = dict(groups) + if (pd.Series(groups_columns.keys()) < 0).any(): + raise ValueError("Must pass a number of quarters >= 0") out = {} - date_values = pd.DataFrame({'dates': dates}) - date_values['key'] = 1 - self.estimates['key'] = 1 - merged = pd.merge(date_values, self.estimates, on='key') + date_values = pd.DataFrame({SIMULTATION_DATES: dates}) + # dates column must be of type datetime64[ns] in order for subsequent + # comparisons to work correctly. 
+ date_values[SIMULTATION_DATES] = date_values[ + SIMULTATION_DATES + ].astype('datetime64[ns]') + estimates_all_dates = cross_product(date_values, self.estimates) asset_df = pd.DataFrame({SID_FIELD_NAME: assets}) - asset_df['key'] = 1 - dates_sids = pd.merge(date_values, asset_df, on='key') - merged.drop('key', axis=1, inplace=True) - dates_sids.drop('key', axis=1, inplace=True) - for num_quarters in groups: - name_map = {c: self.base_column_name_map[getattr(c.dataset.__base__, c.name)] for c in columns} + dates_sids = cross_product(date_values, asset_df) + for num_quarters, columns in groups_columns.iteritems(): + name_map = {c: + self.base_column_name_map[ + getattr(c.dataset.__base__, c.name) + ] for c in columns} - columns = groups[num_quarters] - # First, group by sid, fiscal year, and fiscal quarter and only - # keep the last estimate made. - final_releases_per_qtr = merged[merged[TS_FIELD_NAME] <= - merged.dates].sort( - ['dates', TS_FIELD_NAME] - ).groupby( - ['dates', SID_FIELD_NAME, FISCAL_YEAR, FISCAL_QUARTER] - ).last() - final_releases_per_qtr = final_releases_per_qtr.reset_index() + # First, determine which estimates we would have known about on + # each date. Then, Sort by timestamp and group to find the latest + # estimate for each quarter. + final_releases_per_qtr = estimates_all_dates[ + estimates_all_dates[TS_FIELD_NAME] <= + estimates_all_dates.dates + ].sort([TS_FIELD_NAME]).groupby( + [SIMULTATION_DATES, + SID_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME] + ).nth(-1).reset_index() result = self.load_quarters(num_quarters, dates_sids, @@ -184,50 +138,51 @@ class QuarterEstimatesLoader(PipelineLoader): for c in columns: column_name = name_map[c] - # Need to pass a DataFrame that has dates as the index and - # all sids as columns with column values being the value in - # 'result' for column c + # Pivot to get a DataFrame with dates as the index and + # sids as the columns. 
loader = DataFrameLoader( c, - result.pivot(index='dates', + result.pivot(index=SIMULTATION_DATES, columns=SID_FIELD_NAME, values=column_name), adjustments=None ) - out[c] = loader.load_adjusted_array([c], dates, assets, mask)[c] + out[c] = loader.load_adjusted_array([c], + dates, + assets, + mask)[c] return out class NextQuartersEstimatesLoader(QuarterEstimatesLoader): - #@profile def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): - # Filter for releases that are after each simulation date. + # Filter for releases that are on or after each simulation date. eligible_next_releases = final_releases_per_qtr[ final_releases_per_qtr[EVENT_DATE_FIELD_NAME] >= - final_releases_per_qtr['dates'] + final_releases_per_qtr[SIMULTATION_DATES] ] - + # For each sid, get the upcoming release. eligible_next_releases.sort(EVENT_DATE_FIELD_NAME) - # For each sid, get the upcoming release/year/quarter. next_releases = eligible_next_releases.groupby( - ['dates', SID_FIELD_NAME] + [SIMULTATION_DATES, SID_FIELD_NAME] ).nth(0).reset_index() # We use nth here to avoid forward filling # NaNs, which `first()` will do. next_releases = next_releases.rename( - columns={FISCAL_YEAR: NEXT_FISCAL_YEAR, - FISCAL_QUARTER: NEXT_FISCAL_QUARTER} + columns={FISCAL_YEAR_FIELD_NAME: NEXT_FISCAL_YEAR, + FISCAL_QUARTER_FIELD_NAME: NEXT_FISCAL_QUARTER} ) - # `next_qtr` is already the next quarter over, - # so we should offest `num_shifts` by 1. - (next_releases[FISCAL_YEAR], - next_releases[FISCAL_QUARTER]) = calc_forward_shift( + # The next fiscal quarter is already our starting point, + # so we should offset `num_quarters` by 1. + (next_releases[FISCAL_YEAR_FIELD_NAME], + next_releases[FISCAL_QUARTER_FIELD_NAME]) = shift_quarters( + (num_quarters - 1), next_releases[NEXT_FISCAL_YEAR], - next_releases[NEXT_FISCAL_QUARTER], (num_quarters - 1) + next_releases[NEXT_FISCAL_QUARTER], ) - # Merge to get the rows we care about for each date + # Do a left merge to get values for each date. 
result = dates_sids.merge(next_releases, - on=(['dates', SID_FIELD_NAME]), + on=([SIMULTATION_DATES, SID_FIELD_NAME]), how='left') return result @@ -236,37 +191,36 @@ class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): def __init__(self, estimates, columns): - super(PreviousQuartersEstimatesLoader, self).__init__(estimates, columns) + super(PreviousQuartersEstimatesLoader, self).__init__(estimates, + columns) - #@profile def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): - # Filter for releases that are before each simulation date. + # Filter for releases that are on or before each simulation date. eligible_previous_releases = final_releases_per_qtr[ final_releases_per_qtr[EVENT_DATE_FIELD_NAME] <= - final_releases_per_qtr['dates'] + final_releases_per_qtr[SIMULTATION_DATES] ] - + # For each sid, get the latest release. eligible_previous_releases.sort(EVENT_DATE_FIELD_NAME) - # For each sid, get the latest release we knew about prior to - # each simulation date. previous_releases = eligible_previous_releases.groupby( - ['dates', SID_FIELD_NAME] + [SIMULTATION_DATES, SID_FIELD_NAME] ).nth(-1).reset_index() # We use nth here to avoid forward filling # NaNs, which `last()` will do. - previous_releases = previous_releases.rename(columns={ - FISCAL_YEAR: PREVIOUS_FISCAL_YEAR, - FISCAL_QUARTER: PREVIOUS_FISCAL_QUARTER + FISCAL_YEAR_FIELD_NAME: PREVIOUS_FISCAL_YEAR, + FISCAL_QUARTER_FIELD_NAME: PREVIOUS_FISCAL_QUARTER }) - - (previous_releases[FISCAL_YEAR], - previous_releases[FISCAL_QUARTER]) = \ - calc_backward_shift( - previous_releases[PREVIOUS_FISCAL_YEAR], previous_releases[ - PREVIOUS_FISCAL_QUARTER], (num_quarters - 1) + # The previous fiscal quarter is already our starting point, + # so we should offset `num_quarters` by 1. 
+ (previous_releases[FISCAL_YEAR_FIELD_NAME], + previous_releases[FISCAL_QUARTER_FIELD_NAME]) = shift_quarters( + -(num_quarters - 1), + previous_releases[PREVIOUS_FISCAL_YEAR], + previous_releases[PREVIOUS_FISCAL_QUARTER], ) - # Merge to get the rows we care about for each date + # Do a left merge to get values for each date. result = dates_sids.merge(previous_releases, - on=(['dates', SID_FIELD_NAME]), how='left') + on=([SIMULTATION_DATES, + SID_FIELD_NAME]), + how='left') return result - diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 385022a8..77a9f447 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -2,7 +2,6 @@ import datetime import numpy as np import pandas as pd -from zipline.pipeline.loaders.frame import DataFrameLoader from zipline.utils.pandas_utils import mask_between_time @@ -273,20 +272,3 @@ def check_data_query_args(data_query_time, data_query_tz): data_query_tz, ), ) - - -def choose_rows_by_indexer(rows, name_map, indexer, columns, dates, sids, mask): - def to_frame(array): - return pd.DataFrame(array, index=dates, columns=sids) - - out = {} - for c in columns: - raw = rows[name_map[c]][indexer] - # indexer will be -1 for locations where we don't have a known - # value. - raw[indexer < 0] = c.missing_value - - # Delegate the actual array formatting logic to a DataFrameLoader. - loader = DataFrameLoader(c, to_frame(raw), adjustments=None) - out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c] - return out \ No newline at end of file From e53d7fc9b2437c1234af14064b57aacca3ee03d3 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Wed, 17 Aug 2016 16:05:52 -0400 Subject: [PATCH 05/16] ENH: add an adjustment for float64 2d arrays. 
BUG: fix syntax error MAINT: optimize code for cython --- zipline/lib/adjustment.pyx | 79 +++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/zipline/lib/adjustment.pyx b/zipline/lib/adjustment.pyx index b0ba1873..fc133d79 100644 --- a/zipline/lib/adjustment.pyx +++ b/zipline/lib/adjustment.pyx @@ -3,7 +3,7 @@ from cpython cimport Py_EQ from pandas import isnull, Timestamp from numpy cimport float64_t, uint8_t, int64_t -from numpy import datetime64, float64 +from numpy import asarray, datetime64, float64 # Purely for readability. There aren't C-level declarations for these types. ctypedef object Int64Index_t ctypedef object DatetimeIndex_t @@ -364,6 +364,83 @@ cdef class Float64Overwrite(Float64Adjustment): data[row, col] = value +cdef class Float641DArrayOverwrite: + """ + An adjustment that overwrites subarrays with a value for each subarray. + + Example + ------- + + >>> import numpy as np + >>> arr = np.arange(25, dtype=float).reshape(5, 5) + >>> arr + array([[ 0., 1., 2., 3., 4.], + [ 5., 6., 7., 8., 9.], + [ 10., 11., 12., 13., 14.], + [ 15., 16., 17., 18., 19.], + [ 20., 21., 22., 23., 24.]]) + >>> adj = Float641DArrayOverwrite( + ... row_starts=np.array([0, 3]), + ... row_ends=np.array([2, 4]), + ... column_starts=np.array([0, 2]), + ... column_ends=np.array([1, 4]), + ... 
values=np.array([10., 20.]), + ) + >>> adj.mutate(arr) + >>> arr + array([[ 10., 10., 2., 3., 4.], + [ 10., 10., 7., 8., 9.], + [ 10., 10., 12., 13., 14.], + [ 15., 16., 20., 20., 20.], + [ 20., 21., 20., 20., 20.]]) + """ + cdef: + readonly int64_t[:] row_starts, row_ends, column_starts, column_ends + readonly float64_t[:] values + + def __init__(self, + int64_t[:] row_starts, + int64_t[:] row_ends, + int64_t[:] column_starts, + int64_t[:] column_ends, + float64_t[:] values): + assert (len(row_starts) == + len(row_ends) == + len(column_starts) == + len(column_ends)) + for (row_start, row_end) in zip(row_starts, row_ends): + assert row_start <= row_end + for (column_start, column_end) in zip(column_starts, column_ends): + assert column_start <= column_end + + self.row_starts = row_starts + self.row_ends = row_ends + self.column_starts = column_starts + self.column_ends = column_ends + self.values = values + + cpdef mutate(self, float64_t[:, :] data): + cdef Py_ssize_t fill_range, row, col + for fill_range in range(len(self.row_starts)): + for row in range(self.row_starts[fill_range], + self.row_ends[fill_range] + 1): + for col in range(self.column_starts[fill_range], + self.column_ends[fill_range] + 1): + data[row, col] = self.values[fill_range] + + def __repr__(self): + return ( + "%s(row_starts=%s, row_ends=%s," + " column_starts=%s, column_ends=%s, values=%s)" % ( + type(self).__name__, + asarray(self.row_starts), + asarray(self.row_ends), + asarray(self.column_starts), + asarray(self.column_ends), + asarray(self.values), + ) + ) + cdef class Float64Add(Float64Adjustment): """ An adjustment that adds a float. 
From 2975f9b2fd8df3c53598b7245e8c81ccc6cfec71 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Thu, 18 Aug 2016 14:19:48 -0400 Subject: [PATCH 06/16] TST: add test for 1d array overwrite --- tests/pipeline/test_adjusted_array.py | 101 ++++++++++++++ tests/pipeline/test_quarters_estimates.py | 1 - zipline/pipeline/loaders/blaze/core.py | 81 ++--------- zipline/pipeline/loaders/quarter_estimates.py | 127 +++++++++++------- zipline/pipeline/loaders/utils.py | 71 ++++++++++ 5 files changed, 261 insertions(+), 120 deletions(-) diff --git a/tests/pipeline/test_adjusted_array.py b/tests/pipeline/test_adjusted_array.py index cea6f090..80a1dd97 100644 --- a/tests/pipeline/test_adjusted_array.py +++ b/tests/pipeline/test_adjusted_array.py @@ -22,6 +22,7 @@ from zipline.lib.adjustment import ( Datetime64Overwrite, Float64Multiply, Float64Overwrite, + Float641DArrayOverwrite, ObjectOverwrite, ) from zipline.lib.adjusted_array import AdjustedArray, NOMASK @@ -304,6 +305,105 @@ def _gen_overwrite_adjustment_cases(name, ) +def _gen_overwrite_1d_array_adjustment_case(): + """ + Generate test cases for overwrite adjustments. + + The algorithm used here is the same as the one used above for + multiplicative adjustments. The only difference is the semantics of how + the adjustments are expected to modify the arrays. + + This is parameterized on `make_input` and `make_expected_output` functions, + which take 2-D lists of values and transform them into desired input/output + arrays. We do this so that we can easily test both vanilla numpy ndarrays + and our own LabelArray class for strings. + """ + + adjustments = {} + buffer_as_of = [None] * 6 + baseline = as_dtype(float64_dtype, [[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + buffer_as_of[0] = as_dtype(float64_dtype, [[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + # Note that row indices are inclusive! 
+ adjustments[1] = [ + Float641DArrayOverwrite(array([0]), + array([0]), + array([0]), + array([0]), + as_dtype(float64_dtype, array([1]))) + ] + buffer_as_of[1] = as_dtype(float64_dtype, [[1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + # No adjustment at index 2. + buffer_as_of[2] = buffer_as_of[1] + + adjustments[3] = [ + Float641DArrayOverwrite(array([0, 2, 1]), + array([1, 2, 2]), + array([0, 0, 1]), + array([0, 0, 1]), + as_dtype(float64_dtype, array([4, 1, 3]))) + ] + buffer_as_of[3] = as_dtype(float64_dtype, [[4, 2, 2], + [4, 3, 2], + [1, 3, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + + adjustments[4] = [ + Float641DArrayOverwrite(array([0]), + array([3]), + array([2]), + array([2]), + as_dtype(float64_dtype, array([5]))) + ] + buffer_as_of[4] = as_dtype(float64_dtype, [[4, 2, 5], + [4, 3, 5], + [1, 3, 5], + [2, 2, 5], + [2, 2, 2], + [2, 2, 2]]) + + adjustments[5] = [ + Float641DArrayOverwrite(array([0, 2]), + array([4, 2]), + array([1, 2]), + array([1, 2]), + as_dtype(float64_dtype, array([6, 7]))), + ] + buffer_as_of[5] = as_dtype(float64_dtype, [[4, 6, 5], + [4, 6, 5], + [1, 6, 7], + [2, 6, 5], + [2, 6, 2], + [2, 2, 2]]) + + return _gen_expectations( + baseline, + default_missing_value_for_dtype(float64_dtype), + adjustments, + buffer_as_of, + nrows=6, + ) + + def _gen_expectations(baseline, missing_value, adjustments, @@ -442,6 +542,7 @@ class AdjustedArrayTestCase(TestCase): datetime64ns_dtype, ), ), + _gen_overwrite_1d_array_adjustment_case(), # There are six cases here: # Using np.bytes/np.unicode/object arrays as inputs. 
# Passing np.bytes/np.unicode/object arrays to LabelArray, diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index f343bfb8..1f82d161 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -20,7 +20,6 @@ from zipline.pipeline.loaders.quarter_estimates import ( NextQuartersEstimatesLoader, PreviousQuartersEstimatesLoader ) -from zipline.pipeline.loaders.quarter_estimates import shift_quarters from zipline.testing import ZiplineTestCase from zipline.testing.fixtures import WithAssetFinder, WithTradingSessions from zipline.testing.predicates import assert_equal diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 01e1e659..c076029d 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -175,9 +175,10 @@ from zipline.pipeline.common import ( from zipline.pipeline.data.dataset import DataSet, Column from zipline.pipeline.loaders.utils import ( check_data_query_args, + last_in_date_group, normalize_data_query_bounds, normalize_timestamp_to_query_time, -) + ffill_across_cols) from zipline.pipeline.sentinels import NotSpecified from zipline.lib.adjusted_array import AdjustedArray, can_represent_dtype from zipline.lib.adjustment import Float64Overwrite @@ -869,9 +870,9 @@ def adjustments_from_deltas_with_sids(dense_dates, Parameters ---------- - dates : pd.DatetimeIndex - The dates requested by the loader. dense_dates : pd.DatetimeIndex + The dates requested by the loader. + sparse_dates : pd.DatetimeIndex The dates that were in the raw data. column_idx : int The index of the column in the dataset. 
@@ -1091,71 +1092,15 @@ class BlazeLoader(dict): ) sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True) - def last_in_date_group(df, reindex, have_sids=have_sids): - idx = dates[dates.searchsorted( - df[TS_FIELD_NAME].values.astype('datetime64[D]') - )] - if have_sids: - idx = [idx, SID_FIELD_NAME] - - last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby( - idx, - sort=False, - ).last() - - if have_sids: - last_in_group = last_in_group.unstack() - - if reindex: - if have_sids: - cols = last_in_group.columns - last_in_group = last_in_group.reindex( - index=dates, - columns=pd.MultiIndex.from_product( - (cols.levels[0], assets), - names=cols.names, - ), - ) - else: - last_in_group = last_in_group.reindex(dates) - - return last_in_group - - sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False) - dense_output = last_in_date_group(sparse_output, reindex=True) - dense_output.ffill(inplace=True) - - # Fill in missing values specified by each column. This is made - # significantly more complex by the fact that we need to work around - # two pandas issues: - - # 1) When we have sids, if there are no records for a given sid for any - # dates, pandas will generate a column full of NaNs for that sid. - # This means that some of the columns in `dense_output` are now - # float instead of the intended dtype, so we have to coerce back to - # our expected type and convert NaNs into the desired missing value. - - # 2) DataFrame.ffill assumes that receiving None as a fill-value means - # that no value was passed. Consequently, there's no way to tell - # pandas to replace NaNs in an object column with None using fillna, - # so we have to roll our own instead using df.where. - for column in columns: - # Special logic for strings since `fillna` doesn't work if the - # missing value is `None`. 
- if column.dtype == categorical_dtype: - dense_output[column.name] = dense_output[ - column.name - ].where(pd.notnull(dense_output[column.name]), - column.missing_value) - else: - # We need to execute `fillna` before `astype` in case the - # column contains NaNs and needs to be cast to bool or int. - # This is so that the NaNs are replaced first, since pandas - # can't convert NaNs for those types. - dense_output[column.name] = dense_output[ - column.name - ].fillna(column.missing_value).astype(column.dtype) - + sparse_deltas = last_in_date_group(non_novel_deltas, + dates, + assets, + reindex=False) + dense_output = last_in_date_group(sparse_output, + dates, + assets, + reindex=True) + ffill_across_cols(dense_output, columns) if have_sids: adjustments_from_deltas = adjustments_from_deltas_with_sids column_view = identity diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index 7837e57d..325e9186 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,7 +1,11 @@ from abc import abstractmethod +from collections import defaultdict +import numpy as np import pandas as pd from six import viewvalues from toolz import groupby +from zipline.lib.adjusted_array import AdjustedArray +from zipline.lib.adjustment import Float641DArrayOverwrite from zipline.pipeline.common import ( EVENT_DATE_FIELD_NAME, @@ -13,6 +17,7 @@ from zipline.pipeline.common import ( from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.frame import DataFrameLoader from zipline.utils.pandas_utils import cross_product +from zipline.pipeline.loaders.utils import last_in_date_group, ffill_across_cols NEXT_FISCAL_QUARTER = 'next_fiscal_quarter' NEXT_FISCAL_YEAR = 'next_fiscal_year' @@ -31,10 +36,6 @@ def split_normalized_quarters(normalized_quarters): return years, quarters + 1 -def shift_quarters(by, years, quarters): - return 
split_normalized_quarters(normalize_quarters(years, quarters) + by) - - def required_estimates_fields(columns): """ Compute the set of resource columns required to serve @@ -93,15 +94,54 @@ class QuarterEstimatesLoader(PipelineLoader): self.base_column_name_map = base_column_name_map @abstractmethod - def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): + def load_quarters(self, num_quarters, last, dates): pass + def get_adjustments(self, df, column, mask, assets, + final_releases_per_qtr, dates, raw_events): + adjustments = defaultdict(list) + for idx, sid in enumerate(assets): + # Get the releases for a particular sid + sid_data = final_releases_per_qtr[final_releases_per_qtr[ + SID_FIELD_NAME] == sid + ] + # Get the release dates for this sid - these are the quarter + # boundaries + qtr_boundaries, years, qtrs = sid_data[[ + EVENT_DATE_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME + ]].unique() + next_qtr_starts = dates.searchsorted(qtr_boundaries, sid='right') + for idx, start in enumerate(next_qtr_starts): + # Here we need to take the new quarter and, for all dates in + # previous quarters, apply adjustments that use this + # quarter's values for those previous dates. + adjustments[start].extend(Float641DArrayOverwrite(first_row, + last_row, + idx, + idx, + value)) + return AdjustedArray( + df[column.name].values.astype(column.dtype), + mask, + adjustments_from_deltas( + dates, + sparse_output[TS_FIELD_NAME].values, + column_idx, + column.name, + asset_idx, + sparse_deltas, + ), + column.missing_value, + ) + def load_adjusted_array(self, columns, dates, assets, mask): # TODO: how can we enforce that datasets have the num_quarters # attribute, given that they're created dynamically? 
groups = groupby(lambda x: x.dataset.num_quarters, columns) groups_columns = dict(groups) - if (pd.Series(groups_columns.keys()) < 0).any(): + if (pd.Series(groups_columns) < 0).any(): raise ValueError("Must pass a number of quarters >= 0") out = {} date_values = pd.DataFrame({SIMULTATION_DATES: dates}) @@ -110,34 +150,36 @@ class QuarterEstimatesLoader(PipelineLoader): date_values[SIMULTATION_DATES] = date_values[ SIMULTATION_DATES ].astype('datetime64[ns]') - estimates_all_dates = cross_product(date_values, self.estimates) asset_df = pd.DataFrame({SID_FIELD_NAME: assets}) dates_sids = cross_product(date_values, asset_df) + self.estimates['normalized_quarters'] = normalize_quarters( + self.estimates[FISCAL_YEAR_FIELD_NAME], + self.estimates[FISCAL_QUARTER_FIELD_NAME], + ).astype(float) for num_quarters, columns in groups_columns.iteritems(): name_map = {c: self.base_column_name_map[ getattr(c.dataset.__base__, c.name) ] for c in columns} - - # First, determine which estimates we would have known about on - # each date. Then, Sort by timestamp and group to find the latest - # estimate for each quarter. - final_releases_per_qtr = estimates_all_dates[ - estimates_all_dates[TS_FIELD_NAME] <= - estimates_all_dates.dates - ].sort([TS_FIELD_NAME]).groupby( - [SIMULTATION_DATES, - SID_FIELD_NAME, - FISCAL_YEAR_FIELD_NAME, - FISCAL_QUARTER_FIELD_NAME] - ).nth(-1).reset_index() + # Determine the last piece of information we know for each column + # on each date in the index. + last = last_in_date_group(self.estimates, True, dates, + assets, + extra_groupers=[ + 'normalized_quarters']).reset_index() + # Forward fill values for each quarter. 
+ ffill_across_cols(last, columns) + stacked = last.stack(1).stack(1).reset_index() result = self.load_quarters(num_quarters, - dates_sids, - final_releases_per_qtr) + stacked, dates) for c in columns: column_name = name_map[c] + pivoted = result.pivot(index=SIMULTATION_DATES, + columns=SID_FIELD_NAME, + values=column_name) + adjusted_array = self.get_adjustments(pivoted, c, mask, assets) # Pivot to get a DataFrame with dates as the index and # sids as the columns. loader = DataFrameLoader( @@ -145,7 +187,7 @@ class QuarterEstimatesLoader(PipelineLoader): result.pivot(index=SIMULTATION_DATES, columns=SID_FIELD_NAME, values=column_name), - adjustments=None + adjustments=adjusted_array ) out[c] = loader.load_adjusted_array([c], dates, @@ -156,34 +198,17 @@ class QuarterEstimatesLoader(PipelineLoader): class NextQuartersEstimatesLoader(QuarterEstimatesLoader): - def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): - # Filter for releases that are on or after each simulation date. - eligible_next_releases = final_releases_per_qtr[ - final_releases_per_qtr[EVENT_DATE_FIELD_NAME] >= - final_releases_per_qtr[SIMULTATION_DATES] - ] - # For each sid, get the upcoming release. - eligible_next_releases.sort(EVENT_DATE_FIELD_NAME) - next_releases = eligible_next_releases.groupby( - [SIMULTATION_DATES, SID_FIELD_NAME] - ).nth(0).reset_index() # We use nth here to avoid forward filling - # NaNs, which `first()` will do. - next_releases = next_releases.rename( - columns={FISCAL_YEAR_FIELD_NAME: NEXT_FISCAL_YEAR, - FISCAL_QUARTER_FIELD_NAME: NEXT_FISCAL_QUARTER} - ) - # The next fiscal quarter is already our starting point, - # so we should offset `num_quarters` by 1. - (next_releases[FISCAL_YEAR_FIELD_NAME], - next_releases[FISCAL_QUARTER_FIELD_NAME]) = shift_quarters( - (num_quarters - 1), - next_releases[NEXT_FISCAL_YEAR], - next_releases[NEXT_FISCAL_QUARTER], - ) - # Do a left merge to get values for each date. 
- result = dates_sids.merge(next_releases, - on=([SIMULTATION_DATES, SID_FIELD_NAME]), - how='left') + def load_quarters(self, num_quarters, stacked, dates): + # Filter for releases that are on or after each simulation date and + # determine the next quarter by picking out the upcoming release for + # each date in the index. + event_date_idxs = dates.searchsorted(pd.to_datetime(stacked[EVENT_DATE_FIELD_NAME]).values) + next_releases = stacked.loc[event_date_idxs >= stacked['level_0']].groupby(['level_0', 'sid']).nth(0) + + + next_releases['shifted_normalized_quarters'] = next_releases[ + 'normalized_quarters'].convert_objects(convert_numeric=True) + (num_quarters - 1) + return result diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 77a9f447..28e0c90d 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -2,6 +2,8 @@ import datetime import numpy as np import pandas as pd +from zipline.pipeline.common import TS_FIELD_NAME, SID_FIELD_NAME +from zipline.utils.numpy_utils import categorical_dtype from zipline.utils.pandas_utils import mask_between_time @@ -272,3 +274,72 @@ def check_data_query_args(data_query_time, data_query_tz): data_query_tz, ), ) + + +def last_in_date_group(df, reindex, dates, assets, have_sids=True, + extra_groupers=[]): + idx = dates[dates.searchsorted( + df[TS_FIELD_NAME].values.astype('datetime64[D]') + )] + if have_sids: + idx = [idx, SID_FIELD_NAME] + extra_groupers + + last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby( + idx, + sort=False, + ).last() + + # For the number of things that we're grouping by (except TS), unstack + # the df + for _ in range(len(idx) - 1): + last_in_group = last_in_group.unstack() + + if reindex: + if have_sids: + cols = last_in_group.columns + last_in_group = last_in_group.reindex( + index=dates, + columns=pd.MultiIndex.from_product( + tuple(cols.levels[0:len(extra_groupers) + 1]) + (assets,), + names=cols.names, + ), + ) + else: + 
last_in_group = last_in_group.reindex(dates) + + return last_in_group + + +def ffill_across_cols(df, columns): + df.ffill(inplace=True) + + # Fill in missing values specified by each column. This is made + # significantly more complex by the fact that we need to work around + # two pandas issues: + + # 1) When we have sids, if there are no records for a given sid for any + # dates, pandas will generate a column full of NaNs for that sid. + # This means that some of the columns in `dense_output` are now + # float instead of the intended dtype, so we have to coerce back to + # our expected type and convert NaNs into the desired missing value. + + # 2) DataFrame.ffill assumes that receiving None as a fill-value means + # that no value was passed. Consequently, there's no way to tell + # pandas to replace NaNs in an object column with None using fillna, + # so we have to roll our own instead using df.where. + for column in columns: + # Special logic for strings since `fillna` doesn't work if the + # missing value is `None`. + if column.dtype == categorical_dtype: + df[column.name] = df[ + column.name + ].where(pd.notnull(df[column.name]), + column.missing_value) + else: + # We need to execute `fillna` before `astype` in case the + # column contains NaNs and needs to be cast to bool or int. + # This is so that the NaNs are replaced first, since pandas + # can't convert NaNs for those types. 
+ df[column.name] = df[ + column.name + ].fillna(column.missing_value).astype(column.dtype) From 199c775fbf6b9a0a6761860a17591d259328bfef Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Mon, 22 Aug 2016 19:52:42 -0400 Subject: [PATCH 07/16] ENH: add adjustment for datetime64 arrays BUG: fix adjustment start index --- zipline/lib/adjustment.pyx | 161 +++++++++++++----- zipline/pipeline/loaders/quarter_estimates.py | 142 ++++++++------- 2 files changed, 190 insertions(+), 113 deletions(-) diff --git a/zipline/lib/adjustment.pyx b/zipline/lib/adjustment.pyx index fc133d79..036c5026 100644 --- a/zipline/lib/adjustment.pyx +++ b/zipline/lib/adjustment.pyx @@ -364,7 +364,39 @@ cdef class Float64Overwrite(Float64Adjustment): data[row, col] = value -cdef class Float641DArrayOverwrite: +cdef class ArrayAdjustment(Adjustment): + """ + Base class for ArrayAdjustments. + + Subclasses should inherit and provide a `values` attribute and a `mutate` + method. + """ + def __init__(self, + int64_t first_row, + int64_t last_row, + int64_t first_col, + int64_t last_col): + super(ArrayAdjustment, self).__init__( + first_row=first_row, + last_row=last_row, + first_col=first_col, + last_col=last_col, + ) + + def __repr__(self): + return ( + "%s(first_row=%d, last_row=%d," + " first_col=%d, last_col=%d, values=%s)" % ( + type(self).__name__, + self.first_row, + self.last_row, + self.first_col, + self.last_col, + asarray(self.values), + ) + ) + +cdef class Float641DArrayOverwrite(ArrayAdjustment): """ An adjustment that overwrites subarrays with a value for each subarray. @@ -380,66 +412,101 @@ cdef class Float641DArrayOverwrite: [ 15., 16., 17., 18., 19.], [ 20., 21., 22., 23., 24.]]) >>> adj = Float641DArrayOverwrite( - ... row_starts=np.array([0, 3]), - ... row_ends=np.array([2, 4]), - ... column_starts=np.array([0, 2]), - ... column_ends=np.array([1, 4]), - ... values=np.array([10., 20.]), + ... row_start=0, + ... row_end=3, + ... column_start=0, + ... column_end=0, + ... 
values=np.array([1, 2, 3, 4]), ) >>> adj.mutate(arr) >>> arr - array([[ 10., 10., 2., 3., 4.], - [ 10., 10., 7., 8., 9.], - [ 10., 10., 12., 13., 14.], - [ 15., 16., 20., 20., 20.], - [ 20., 21., 20., 20., 20.]]) + array([[ 1., 1., 2., 3., 4.], + [ 2., 6., 7., 8., 9.], + [ 3., 11., 12., 13., 14.], + [ 4., 16., 17., 18., 19.], + [ 20., 21., 22., 23., 24.]]) """ cdef: - readonly int64_t[:] row_starts, row_ends, column_starts, column_ends readonly float64_t[:] values def __init__(self, - int64_t[:] row_starts, - int64_t[:] row_ends, - int64_t[:] column_starts, - int64_t[:] column_ends, + int64_t first_row, + int64_t last_row, + int64_t first_col, + int64_t last_col, float64_t[:] values): - assert (len(row_starts) == - len(row_ends) == - len(column_starts) == - len(column_ends)) - for (row_start, row_end) in zip(row_starts, row_ends): - assert row_start <= row_end - for (column_start, column_end) in zip(column_starts, column_ends): - assert column_start <= column_end - - self.row_starts = row_starts - self.row_ends = row_ends - self.column_starts = column_starts - self.column_ends = column_ends + super(Float641DArrayOverwrite, self).__init__( + first_row=first_row, + last_row=last_row, + first_col=first_col, + last_col=last_col, + ) + assert (last_row + 1 - first_row) == len(values) self.values = values cpdef mutate(self, float64_t[:, :] data): cdef Py_ssize_t fill_range, row, col - for fill_range in range(len(self.row_starts)): - for row in range(self.row_starts[fill_range], - self.row_ends[fill_range] + 1): - for col in range(self.column_starts[fill_range], - self.column_ends[fill_range] + 1): - data[row, col] = self.values[fill_range] + cdef float64_t[:] values = self.values + for col in range(self.first_col, self.last_col + 1): + for i, row in enumerate(range(self.first_row, self.last_row + 1)): + data[row, col] = values[i] + + +cdef class Datetime641DArrayOverwrite(ArrayAdjustment): + """ + An adjustment that overwrites subarrays with a value for each subarray. 
+ + Example + ------- + + >>> import numpy as np + >>> arr = np.arange(25, dtype=float).reshape(5, 5) + >>> arr + array([[ 0., 1., 2., 3., 4.], + [ 5., 6., 7., 8., 9.], + [ 10., 11., 12., 13., 14.], + [ 15., 16., 17., 18., 19.], + [ 20., 21., 22., 23., 24.]]) + >>> adj = Datetime641DArrayOverwrite( + ... row_start=0, + ... row_end=3, + ... column_start=0, + ... column_end=0, + ... values=np.array([1, 2, 3, 4]), + ) + >>> adj.mutate(arr) + >>> arr + array([[ 1., 1., 2., 3., 4.], + [ 2., 6., 7., 8., 9.], + [ 3., 11., 12., 13., 14.], + [ 4., 16., 17., 18., 19.], + [ 20., 21., 22., 23., 24.]]) + """ + cdef: + readonly int64_t[:] values + + def __init__(self, + int64_t first_row, + int64_t last_row, + int64_t first_col, + int64_t last_col, + object values): + super(Datetime641DArrayOverwrite, self).__init__( + first_row=first_row, + last_row=last_row, + first_col=first_col, + last_col=last_col, + ) + assert (last_row + 1 - first_row) == len(values) + self.values = asarray([datetime_to_int(value) for value in values]) + + cpdef mutate(self, int64_t[:, :] data): + cdef Py_ssize_t row, col + cdef int64_t[:] values = self.values + for col in range(self.first_col, self.last_col + 1): + for i, row in enumerate(range(self.first_row, self.last_row + 1)): + data[row, col] = values[i] - def __repr__(self): - return ( - "%s(row_starts=%s, row_ends=%s," - " column_starts=%s, column_ends=%s, values=%s)" % ( - type(self).__name__, - asarray(self.row_starts), - asarray(self.row_ends), - asarray(self.column_starts), - asarray(self.column_ends), - asarray(self.values), - ) - ) cdef class Float64Add(Float64Adjustment): """ diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index 325e9186..7d9ce0c4 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,11 +1,14 @@ from abc import abstractmethod from collections import defaultdict +from functools import partial import numpy 
as np +from numpy.ma import asarray import pandas as pd from six import viewvalues -from toolz import groupby +from toolz import groupby, curry from zipline.lib.adjusted_array import AdjustedArray -from zipline.lib.adjustment import Float641DArrayOverwrite +from zipline.lib.adjustment import (Datetime641DArrayOverwrite, + Float641DArrayOverwrite) from zipline.pipeline.common import ( EVENT_DATE_FIELD_NAME, @@ -16,6 +19,7 @@ from zipline.pipeline.common import ( ) from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.frame import DataFrameLoader +from zipline.utils.numpy_utils import datetime64ns_dtype from zipline.utils.pandas_utils import cross_product from zipline.pipeline.loaders.utils import last_in_date_group, ffill_across_cols @@ -97,42 +101,49 @@ class QuarterEstimatesLoader(PipelineLoader): def load_quarters(self, num_quarters, last, dates): pass - def get_adjustments(self, df, column, mask, assets, - final_releases_per_qtr, dates, raw_events): + def get_adjustments(self, result, col_result, last, + column_name, + column, mask, + assets): adjustments = defaultdict(list) - for idx, sid in enumerate(assets): - # Get the releases for a particular sid - sid_data = final_releases_per_qtr[final_releases_per_qtr[ - SID_FIELD_NAME] == sid + if column.dtype == datetime64ns_dtype: + overwrite = Datetime641DArrayOverwrite + else: + overwrite = Float641DArrayOverwrite + for sid_idx, sid in enumerate(assets): + sid_result = result[result.index.get_level_values( + SID_FIELD_NAME + ) == sid] + sid_result = sid_result.reset_index( + level='shifted_normalized_quarters' + ) # Remove qtrs from index to find shifts + # Figure out where we think quarters are changing. 
+ qtr_shifts = sid_result[ + sid_result['shifted_normalized_quarters'] != + sid_result['shifted_normalized_quarters'].shift(1) ] - # Get the release dates for this sid - these are the quarter - # boundaries - qtr_boundaries, years, qtrs = sid_data[[ - EVENT_DATE_FIELD_NAME, - FISCAL_YEAR_FIELD_NAME, - FISCAL_QUARTER_FIELD_NAME - ]].unique() - next_qtr_starts = dates.searchsorted(qtr_boundaries, sid='right') - for idx, start in enumerate(next_qtr_starts): - # Here we need to take the new quarter and, for all dates in - # previous quarters, apply adjustments that use this - # quarter's values for those previous dates. - adjustments[start].extend(Float641DArrayOverwrite(first_row, - last_row, - idx, - idx, - value)) + # Iterate backwards. No adjustment for 1st quarter. + for row_indexer in list(reversed(qtr_shifts.index))[:-1]: + # We want to write the values for this row's quarter over + # everything that comes before this quarter when we are at + # the date before this quarter starts. + qtr_start_idx = last.index.get_loc(row_indexer[0]) + quarter = qtr_shifts.loc[row_indexer][ + 'shifted_normalized_quarters' + ] + adjustments[qtr_start_idx] = \ + [overwrite(0, + qtr_start_idx - 1, # get index date + sid_idx, + sid_idx, + last[column_name, quarter, + sid][:qtr_start_idx].values) + ] + return AdjustedArray( - df[column.name].values.astype(column.dtype), + col_result.values.astype(column.dtype), mask, - adjustments_from_deltas( - dates, - sparse_output[TS_FIELD_NAME].values, - column_idx, - column.name, - asset_idx, - sparse_deltas, - ), + dict(adjustments), column.missing_value, ) @@ -150,8 +161,6 @@ class QuarterEstimatesLoader(PipelineLoader): date_values[SIMULTATION_DATES] = date_values[ SIMULTATION_DATES ].astype('datetime64[ns]') - asset_df = pd.DataFrame({SID_FIELD_NAME: assets}) - dates_sids = cross_product(date_values, asset_df) self.estimates['normalized_quarters'] = normalize_quarters( self.estimates[FISCAL_YEAR_FIELD_NAME], 
self.estimates[FISCAL_QUARTER_FIELD_NAME], @@ -166,50 +175,51 @@ class QuarterEstimatesLoader(PipelineLoader): last = last_in_date_group(self.estimates, True, dates, assets, extra_groupers=[ - 'normalized_quarters']).reset_index() + 'normalized_quarters']) # Forward fill values for each quarter. ffill_across_cols(last, columns) - stacked = last.stack(1).stack(1).reset_index() + stacked = last.stack(1).stack(1) - result = self.load_quarters(num_quarters, - stacked, dates) + result = self.load_quarters(num_quarters, stacked) for c in columns: column_name = name_map[c] - pivoted = result.pivot(index=SIMULTATION_DATES, - columns=SID_FIELD_NAME, - values=column_name) - adjusted_array = self.get_adjustments(pivoted, c, mask, assets) - # Pivot to get a DataFrame with dates as the index and - # sids as the columns. - loader = DataFrameLoader( - c, - result.pivot(index=SIMULTATION_DATES, - columns=SID_FIELD_NAME, - values=column_name), - adjustments=adjusted_array - ) - out[c] = loader.load_adjusted_array([c], - dates, - assets, - mask)[c] + col_result = result[ + column_name + ].reset_index(1, drop=True).unstack(1).reindex(dates) + adjusted_array = self.get_adjustments(result, + col_result, + last, + column_name, + c, + mask, + assets) + out[c] = adjusted_array return out class NextQuartersEstimatesLoader(QuarterEstimatesLoader): - def load_quarters(self, num_quarters, stacked, dates): + def load_quarters(self, num_quarters, stacked): # Filter for releases that are on or after each simulation date and # determine the next quarter by picking out the upcoming release for # each date in the index. 
- event_date_idxs = dates.searchsorted(pd.to_datetime(stacked[EVENT_DATE_FIELD_NAME]).values) - next_releases = stacked.loc[event_date_idxs >= stacked['level_0']].groupby(['level_0', 'sid']).nth(0) - - - next_releases['shifted_normalized_quarters'] = next_releases[ - 'normalized_quarters'].convert_objects(convert_numeric=True) + (num_quarters - 1) - - return result + stacked = stacked.sort(EVENT_DATE_FIELD_NAME) + next_releases = stacked.loc[ + stacked[EVENT_DATE_FIELD_NAME] >= stacked.index.get_level_values( + 0 + )].groupby(level=[0, 2]).nth(0) + next_releases[ + 'shifted_normalized_quarters' + ] = next_releases.index.get_level_values( + 'normalized_quarters' + ) + (num_quarters - 1) + next_releases = next_releases.set_index([ + next_releases.index.get_level_values(0), # dates + 'shifted_normalized_quarters', + next_releases.index.get_level_values(2) # sids + ]) + return stacked.loc[next_releases.index] class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): From ebbe85b79a62d8531e8551c07df0c458d8dc4ea5 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 23 Aug 2016 11:06:07 -0400 Subject: [PATCH 08/16] TST: add test for datetime array and update test TST: fix quarter normalization test TST: change test name BUG: remove arg BUG: look at dict keys TST: add test for windowing MAINT: raise ValueError instead of asserting TST: add assertion to check windowing TST: parametrize test over number of quarters forward/back. BUG: fix adjustment calculation logic for quarter crossovers. 
TST: add test for previous quarter windows BUG: fix bugs in calculating previous windows BUG: fix missing value for datetime TST: add test case for missing quarter --- tests/pipeline/test_adjusted_array.py | 143 ++++--- tests/pipeline/test_quarters_estimates.py | 196 ++++++++-- zipline/lib/adjustment.pyx | 24 +- zipline/pipeline/loaders/blaze/core.py | 3 +- zipline/pipeline/loaders/events.py | 2 - zipline/pipeline/loaders/quarter_estimates.py | 363 ++++++++++++------ zipline/pipeline/loaders/utils.py | 47 ++- 7 files changed, 552 insertions(+), 226 deletions(-) diff --git a/tests/pipeline/test_adjusted_array.py b/tests/pipeline/test_adjusted_array.py index 80a1dd97..a5bf3ed0 100644 --- a/tests/pipeline/test_adjusted_array.py +++ b/tests/pipeline/test_adjusted_array.py @@ -20,6 +20,7 @@ from toolz import curry from zipline.errors import WindowLengthNotPositive, WindowLengthTooLong from zipline.lib.adjustment import ( Datetime64Overwrite, + Datetime641DArrayOverwrite, Float64Multiply, Float64Overwrite, Float641DArrayOverwrite, @@ -305,7 +306,11 @@ def _gen_overwrite_adjustment_cases(name, ) -def _gen_overwrite_1d_array_adjustment_case(): +def _gen_overwrite_1d_array_adjustment_case(name, + make_input, + make_expected_output, + dtype, + missing_value): """ Generate test cases for overwrite adjustments. @@ -314,90 +319,91 @@ def _gen_overwrite_1d_array_adjustment_case(): the adjustments are expected to modify the arrays. This is parameterized on `make_input` and `make_expected_output` functions, - which take 2-D lists of values and transform them into desired input/output + which take 1-D lists of values and transform them into desired input/output arrays. We do this so that we can easily test both vanilla numpy ndarrays and our own LabelArray class for strings. 
""" - + adjustment_type = { + float64_dtype: Float641DArrayOverwrite, + datetime64ns_dtype: Datetime641DArrayOverwrite, + }[dtype] adjustments = {} buffer_as_of = [None] * 6 - baseline = as_dtype(float64_dtype, [[2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + baseline = make_input([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) - buffer_as_of[0] = as_dtype(float64_dtype, [[2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[0] = make_expected_output([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + vals1 = [1] # Note that row indices are inclusive! adjustments[1] = [ - Float641DArrayOverwrite(array([0]), - array([0]), - array([0]), - array([0]), - as_dtype(float64_dtype, array([1]))) + adjustment_type( + 0, 0, 0, 0, + array([coerce_to_dtype(dtype, val) for val in vals1]) + ) ] - buffer_as_of[1] = as_dtype(float64_dtype, [[1, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[1] = make_input([[1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) # No adjustment at index 2. 
buffer_as_of[2] = buffer_as_of[1] + vals3 = [4, 4, 1] adjustments[3] = [ - Float641DArrayOverwrite(array([0, 2, 1]), - array([1, 2, 2]), - array([0, 0, 1]), - array([0, 0, 1]), - as_dtype(float64_dtype, array([4, 1, 3]))) + adjustment_type( + 0, 2, 0, 0, + array([coerce_to_dtype(dtype, val) for val in vals3]) + ) ] - buffer_as_of[3] = as_dtype(float64_dtype, [[4, 2, 2], - [4, 3, 2], - [1, 3, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[3] = make_input([[4, 2, 2], + [4, 2, 2], + [1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) + vals4 = [5] * 4 adjustments[4] = [ - Float641DArrayOverwrite(array([0]), - array([3]), - array([2]), - array([2]), - as_dtype(float64_dtype, array([5]))) + adjustment_type( + 0, 3, 2, 2, + array([coerce_to_dtype(dtype, val) for val in vals4])) ] - buffer_as_of[4] = as_dtype(float64_dtype, [[4, 2, 5], - [4, 3, 5], - [1, 3, 5], - [2, 2, 5], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[4] = make_input([[4, 2, 5], + [4, 2, 5], + [1, 2, 5], + [2, 2, 5], + [2, 2, 2], + [2, 2, 2]]) + vals5 = range(1, 6) adjustments[5] = [ - Float641DArrayOverwrite(array([0, 2]), - array([4, 2]), - array([1, 2]), - array([1, 2]), - as_dtype(float64_dtype, array([6, 7]))), + adjustment_type( + 0, 4, 1, 1, + array([coerce_to_dtype(dtype, val) for val in vals5])), ] - buffer_as_of[5] = as_dtype(float64_dtype, [[4, 6, 5], - [4, 6, 5], - [1, 6, 7], - [2, 6, 5], - [2, 6, 2], - [2, 2, 2]]) + buffer_as_of[5] = make_input([[4, 1, 5], + [4, 2, 5], + [1, 3, 5], + [2, 4, 5], + [2, 5, 2], + [2, 2, 2]]) return _gen_expectations( baseline, - default_missing_value_for_dtype(float64_dtype), + missing_value, adjustments, buffer_as_of, nrows=6, @@ -542,7 +548,22 @@ class AdjustedArrayTestCase(TestCase): datetime64ns_dtype, ), ), - _gen_overwrite_1d_array_adjustment_case(), + _gen_overwrite_1d_array_adjustment_case( + 'float', + make_input=as_dtype(float64_dtype), + make_expected_output=as_dtype(float64_dtype), + dtype=float64_dtype, + 
missing_value=default_missing_value_for_dtype(float64_dtype), + ), + _gen_overwrite_1d_array_adjustment_case( + 'datetime', + make_input=as_dtype(datetime64ns_dtype), + make_expected_output=as_dtype(datetime64ns_dtype), + dtype=datetime64ns_dtype, + missing_value=default_missing_value_for_dtype( + datetime64ns_dtype, + ), + ), # There are six cases here: # Using np.bytes/np.unicode/object arrays as inputs. # Passing np.bytes/np.unicode/object arrays to LabelArray, diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 1f82d161..5129bb5a 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -1,9 +1,10 @@ import blaze as bz import itertools +from nose_parameterized import parameterized import numpy as np import pandas as pd -from zipline.pipeline import SimplePipelineEngine, Pipeline +from zipline.pipeline import SimplePipelineEngine, Pipeline, CustomFactor from zipline.pipeline.common import ( EVENT_DATE_FIELD_NAME, FISCAL_QUARTER_FIELD_NAME, @@ -18,8 +19,8 @@ from zipline.pipeline.loaders.blaze.estimates import ( ) from zipline.pipeline.loaders.quarter_estimates import ( NextQuartersEstimatesLoader, - PreviousQuartersEstimatesLoader -) + PreviousQuartersEstimatesLoader, + split_normalized_quarters, normalize_quarters) from zipline.testing import ZiplineTestCase from zipline.testing.fixtures import WithAssetFinder, WithTradingSessions from zipline.testing.predicates import assert_equal @@ -31,7 +32,6 @@ class Estimates(DataSet): fiscal_quarter = Column(dtype=float64_dtype) fiscal_year = Column(dtype=float64_dtype) estimate = Column(dtype=float64_dtype) - value = Column(dtype=float64_dtype) def QuartersEstimates(num_qtr): @@ -40,6 +40,28 @@ def QuartersEstimates(num_qtr): name = Estimates return QtrEstimates + +# 0Q1: 2015-01-05.Q1.e1.2015-01-06, 2015-01-10.Q1.e1.2015-01-11, +# 0Q2: 2015-01-15.Q2.e1.2015-01-16, 2015-01-20.Q2.e1.2015-01-21, +# 0Q4: 
2015-02-05.Q4.e1.2015-02-06, 2015-02-10.Q4.e1.2015-02-11, +# Skip Q3 to make sure we handle skipped quarter data correctly. +estimates_timeline = pd.DataFrame({ + TS_FIELD_NAME: [pd.Timestamp('2015-01-05'), pd.Timestamp('2015-01-07'), + pd.Timestamp('2015-01-05'), pd.Timestamp('2015-01-17'), + pd.Timestamp('2015-01-05'), pd.Timestamp('2015-01-17'), + pd.Timestamp('2015-01-22'), pd.Timestamp('2015-02-02')], + EVENT_DATE_FIELD_NAME: + [pd.Timestamp('2015-01-10'), pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-20'), pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-02-10'), pd.Timestamp('2015-02-10'), + pd.Timestamp('2015-02-10'), pd.Timestamp('2015-02-10')], + 'estimate': [1.]*2 + [2.] * 2 + [4.] * 4, + FISCAL_QUARTER_FIELD_NAME: [1]*2 + [2] * 2 + [4] * 4, + FISCAL_YEAR_FIELD_NAME: [2015]*8, + SID_FIELD_NAME: [0]*8 +}) + + # Final release dates never change. The quarters have very tight date ranges # in order to reduce the number of dates we need to iterate through when # testing. @@ -48,7 +70,6 @@ releases = pd.DataFrame({ EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-31')], 'estimate': [0.5, 0.8], - 'value': [0.6, 0.9], FISCAL_QUARTER_FIELD_NAME: [1.0, 2.0], FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0] }) @@ -70,7 +91,6 @@ q2_release_dates = [pd.Timestamp('2015-01-30'), # One day early estimates = pd.DataFrame({ EVENT_DATE_FIELD_NAME: q1_release_dates + q2_release_dates, 'estimate': [.1, .2, .3, .4], - 'value': [np.NaN, np.NaN, np.NaN, np.NaN], FISCAL_QUARTER_FIELD_NAME: [1.0, 1.0, 2.0, 2.0], FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0, 2015.0, 2015.0] }) @@ -110,14 +130,12 @@ class EstimateTestCase(WithAssetFinder, @classmethod def init_class_fixtures(cls): - cls.events = gen_estimates() cls.sids = cls.events['sid'].unique() cls.columns = { Estimates.estimate: 'estimate', Estimates.event_date: EVENT_DATE_FIELD_NAME, Estimates.fiscal_quarter: FISCAL_QUARTER_FIELD_NAME, Estimates.fiscal_year: FISCAL_YEAR_FIELD_NAME, - Estimates.value: 
'value', } cls.loader = cls.make_loader( events=cls.events, @@ -147,7 +165,138 @@ class EstimateTestCase(WithAssetFinder, ) +window_test_cases = [ + (window_len, start_idx, num_quarters_out) for + (window_len, start_idx), num_quarters_out in + itertools.product( + [[5, pd.Timestamp('2015-01-09').tz_localize('utc')], + [6, pd.Timestamp('2015-01-12').tz_localize('utc')], + [11, pd.Timestamp('2015-01-20').tz_localize('utc')], + [19, pd.Timestamp('2015-01-30').tz_localize('utc')], + [26, pd.Timestamp('2015-02-10').tz_localize('utc')]], + [1, 2, 3, 4]) +] + + +class NextEstimateWindowsTestCase(EstimateTestCase): + events = estimates_timeline + START_DATE = pd.Timestamp('2014-12-31') + END_DATE = pd.Timestamp('2015-02-15') + + @classmethod + def make_loader(cls, events, columns): + return NextQuartersEstimatesLoader(events, columns) + + @parameterized.expand(window_test_cases) + def test_next_estimate_windows_at_quarter_boundaries(self, + window_len, + start_idx, + num_quarters_out): + """ + Tests that we overwrite values with the correct quarter's estimate at + the correct dates. + """ + dataset = QuartersEstimates(num_quarters_out) + + class SomeFactor(CustomFactor): + inputs = [dataset.estimate] + window_length = window_len + + def compute(self, today, assets, out, *inputs): + unique_inputs = np.unique(inputs).tolist() + requested_quarter = None + if (pd.Timestamp('2015-02-10').tz_localize('utc') >= today >= + pd.Timestamp('2015-01-05').tz_localize('utc')): + next_quarter = estimates_timeline[ + estimates_timeline[EVENT_DATE_FIELD_NAME] >= today + ].min()[FISCAL_QUARTER_FIELD_NAME] + requested_quarter = next_quarter + num_quarters_out - 1 + + # If we know something about the requested quarter, assert + # that all our estimates in the window are about that quarter. 
+ if requested_quarter and requested_quarter <= 4 and \ + requested_quarter != 3: + assert np.equal(unique_inputs, requested_quarter).all() + else: + # We don't have any information yet about the next quarter + # or about the requested quarter; in that case, all our + # estimates in the window should be NaN across time. + assert np.isnan(unique_inputs).all() + + engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + engine.run_pipeline( + Pipeline({'est': SomeFactor()}), + start_date=start_idx, + end_date=self.trading_days[-1], + ) + + +class PreviousEstimateWindowsTestCase(EstimateTestCase): + events = estimates_timeline + START_DATE = pd.Timestamp('2014-12-31') + END_DATE = pd.Timestamp('2015-02-15') + + @classmethod + def make_loader(cls, events, columns): + return PreviousQuartersEstimatesLoader(events, columns) + + @parameterized.expand(window_test_cases) + def test_previous_estimate_windows_at_quarter_boundaries(self, + window_len, + start_idx, + num_quarters_out): + """ + Tests that we overwrite values with the correct quarter's estimate at + the correct dates. + """ + dataset = QuartersEstimates(num_quarters_out) + + class SomeFactor(CustomFactor): + inputs = [dataset.estimate] + window_length = window_len + + def compute(self, today, assets, out, *inputs): + unique_inputs = np.unique(inputs).tolist() + requested_quarter = None + if today >= pd.Timestamp('2015-01-12').tz_localize('utc'): + previous_quarter = estimates_timeline[ + estimates_timeline[EVENT_DATE_FIELD_NAME] <= today + ].max()[FISCAL_QUARTER_FIELD_NAME] + requested_quarter = ( + previous_quarter - (num_quarters_out - 1) + ) + + # If we know something about the requested quarter, assert + # that all our estimates in the window are about that quarter. 
+ if requested_quarter and requested_quarter >= 0 and \ + requested_quarter != 3: + assert np.equal(unique_inputs, requested_quarter).all() + else: + # We don't have any information yet about the previous + # quarter + # or about the requested quarter; in that case, all our + # estimates in the window should be NaN across time. + assert np.isnan(unique_inputs).all() + + engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + engine.run_pipeline( + Pipeline({'est': SomeFactor()}), + start_date=start_idx, + end_date=self.trading_days[-1], + ) + + class NextEstimateTestCase(EstimateTestCase): + events = gen_estimates() + @classmethod def make_loader(cls, events, columns): return NextQuartersEstimatesLoader(events, columns) @@ -229,6 +378,8 @@ class BlazeNextEstimateLoaderTestCase(NextEstimateTestCase): class PreviousEstimateTestCase(EstimateTestCase): + events = gen_estimates() + @classmethod def make_loader(cls, events, columns): return PreviousQuartersEstimatesLoader(events, columns) @@ -314,26 +465,13 @@ class QuarterShiftTestCase(ZiplineTestCase): This tests, in isolation, quarter calculation logic for shifting quarters backwards/forwards from a starting point. """ - def test_calc_forward_shift(self): + def test_quarter_normalization(self): input_yrs = pd.Series([0] * 4) input_qtrs = pd.Series(range(1, 5)) - expected = pd.DataFrame(([yr, qtr] for yr in range(0, 4) for qtr - in range(1, 5))) - for i in range(0, 8): - years, quarters = shift_quarters(i, input_yrs, input_qtrs) - # Can't use assert_series_equal here with check_names=False - # because that still fails due to name differences. 
- assert years.equals(expected[i:i+4].reset_index(drop=True)[0]) - assert quarters.equals(expected[i:i+4].reset_index(drop=True)[1]) - - def test_calc_backward_shift(self): - input_yrs = pd.Series([0] * 4) - input_qtrs = pd.Series(range(4, 0, -1)) - expected = pd.DataFrame(([yr, qtr] for yr in range(0, -4, -1) for qtr - in range(4, 0, -1))) - for i in range(0, 8, 1): - years, quarters = shift_quarters(-i, input_yrs, input_qtrs) - # Can't use assert_series_equal here with check_names=False - # because that still fails due to name differences. - assert years.equals(expected[i:i+4].reset_index(drop=True)[0]) - assert quarters.equals(expected[i:i+4].reset_index(drop=True)[1]) + result_years, result_quarters = split_normalized_quarters( + normalize_quarters(input_yrs, input_qtrs) + ) + # Can't use assert_series_equal here with check_names=False + # because that still fails due to name differences. + assert input_yrs.equals(result_years) + assert input_qtrs.equals(result_quarters) diff --git a/zipline/lib/adjustment.pyx b/zipline/lib/adjustment.pyx index 036c5026..8839ece9 100644 --- a/zipline/lib/adjustment.pyx +++ b/zipline/lib/adjustment.pyx @@ -371,18 +371,6 @@ cdef class ArrayAdjustment(Adjustment): Subclasses should inherit and provide a `values` attribute and a `mutate` method. """ - def __init__(self, - int64_t first_row, - int64_t last_row, - int64_t first_col, - int64_t last_col): - super(ArrayAdjustment, self).__init__( - first_row=first_row, - last_row=last_row, - first_col=first_col, - last_col=last_col, - ) - def __repr__(self): return ( "%s(first_row=%d, last_row=%d," @@ -441,7 +429,11 @@ cdef class Float641DArrayOverwrite(ArrayAdjustment): first_col=first_col, last_col=last_col, ) - assert (last_row + 1 - first_row) == len(values) + if last_row + 1 - first_row != len(values): + raise ValueError( + "Mismatch: got %d values for rows starting at index %d and " + "ending at index %d." 
% (len(values), first_row, last_row) + ) self.values = values cpdef mutate(self, float64_t[:, :] data): @@ -497,7 +489,11 @@ cdef class Datetime641DArrayOverwrite(ArrayAdjustment): first_col=first_col, last_col=last_col, ) - assert (last_row + 1 - first_row) == len(values) + if last_row + 1 - first_row != len(values): + raise ValueError("Mismatch: got %d values for rows starting at" + " index %d and ending at index %d." % ( + len(values), first_row, last_row) + ) self.values = asarray([datetime_to_int(value) for value in values]) cpdef mutate(self, int64_t[:, :] data): diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index c076029d..24ce3e26 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -178,7 +178,8 @@ from zipline.pipeline.loaders.utils import ( last_in_date_group, normalize_data_query_bounds, normalize_timestamp_to_query_time, - ffill_across_cols) + ffill_across_cols +) from zipline.pipeline.sentinels import NotSpecified from zipline.lib.adjusted_array import AdjustedArray, can_represent_dtype from zipline.lib.adjustment import Float64Overwrite diff --git a/zipline/pipeline/loaders/events.py b/zipline/pipeline/loaders/events.py index cb33b3b2..4249d145 100644 --- a/zipline/pipeline/loaders/events.py +++ b/zipline/pipeline/loaders/events.py @@ -167,7 +167,6 @@ class EventsLoader(PipelineLoader): return {} return self._load_events( - rows=self.events, name_map=self.next_value_columns, indexer=self.next_event_indexer(dates, sids), columns=columns, @@ -181,7 +180,6 @@ class EventsLoader(PipelineLoader): return {} return self._load_events( - rows=self.events, name_map=self.previous_value_columns, indexer=self.previous_event_indexer(dates, sids), columns=columns, diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index 7d9ce0c4..6ef8ae7c 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ 
b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,11 +1,9 @@ from abc import abstractmethod from collections import defaultdict -from functools import partial import numpy as np -from numpy.ma import asarray import pandas as pd from six import viewvalues -from toolz import groupby, curry +from toolz import groupby from zipline.lib.adjusted_array import AdjustedArray from zipline.lib.adjustment import (Datetime641DArrayOverwrite, Float641DArrayOverwrite) @@ -18,10 +16,15 @@ from zipline.pipeline.common import ( TS_FIELD_NAME, ) from zipline.pipeline.loaders.base import PipelineLoader -from zipline.pipeline.loaders.frame import DataFrameLoader from zipline.utils.numpy_utils import datetime64ns_dtype -from zipline.utils.pandas_utils import cross_product -from zipline.pipeline.loaders.utils import last_in_date_group, ffill_across_cols +from zipline.pipeline.loaders.utils import ( + ffill_across_cols, + last_in_date_group +) + +NORMALIZED_QUARTERS = 'normalized_quarters' + +SHIFTED_NORMALIZED_QTRS = 'shifted_normalized_quarters' NEXT_FISCAL_QUARTER = 'next_fiscal_quarter' NEXT_FISCAL_YEAR = 'next_fiscal_year' @@ -101,47 +104,184 @@ class QuarterEstimatesLoader(PipelineLoader): def load_quarters(self, num_quarters, last, dates): pass - def get_adjustments(self, result, col_result, last, + def get_requested_data_for_col(self, stacked_last_per_qtr, idx, dates): + """ + Selects the requested data for each date. + + Parameters + ---------- + stacked_last_per_qtr : pd.DataFrame + The latest estimate known per sid per date per quarter with the + dates, normalized quarter, and sid as the index. + idx : pd.MultiIndex + The index of the row of the requested quarter from each date for + each sid. + dates : pd.DatetimeIndex + The calendar dates for which estimates data is requested. 
+ + Returns + -------- + requested_qtr_data : pd.DataFrame + The DataFrame with final values for the requested quarter for all + columns; `dates` are the index and columns are a MultiIndex with + sids at the top level and the dataset columns on the bottom. + """ + requested_qtr_data = stacked_last_per_qtr.loc[idx] + # We no longer need this in the index, but we do need it as a column + # to calculate adjustments. + requested_qtr_data = requested_qtr_data.reset_index( + SHIFTED_NORMALIZED_QTRS + ) + (requested_qtr_data[FISCAL_YEAR_FIELD_NAME], + requested_qtr_data[FISCAL_QUARTER_FIELD_NAME]) = \ + split_normalized_quarters( + requested_qtr_data[SHIFTED_NORMALIZED_QTRS] + ) + # Move sids into the columns. Once we're left with just dates + # as the index, we can reindex by all dates so that we have a + # value for each calendar date. + requested_qtr_data = requested_qtr_data.unstack( + SID_FIELD_NAME + ).reindex(dates) + return requested_qtr_data + + def get_adjustments(self, + zero_qtr_idx, + requested_qtr_idx, + stacked_last_per_qtr, + last_per_qtr, + dates, column_name, - column, mask, - assets): + column, + mask, + assets, + qtr_crossover_point): + """ + Creates an AdjustedArray from the given estimates data for the given + dates. + + Parameters + ---------- + zero_qtr_idx : pd.MultiIndex + The index of the row of the zeroth (immediately next/previous) + quarter from each date for each sid. + requested_qtr_idx : pd.MultiIndex + The index of the row of the requested quarter from each date for + each sid. + stacked_last_per_qtr : pd.DataFrame + The latest estimate known per sid per date per quarter with the + dates, normalized quarter, and sid as the index. + last_per_qtr : pd.DataFrame + The latest estimate known per sid per date per quarter with + dates as the index and normalized quarter and sid in the columns + MultiIndex; allows easy access to the timeline of estimates + across all dates for a sid for a particular quarter. 
+ dates : pd.DatetimeIndex + The calendar dates for which estimates data is requested. + column_name : string + The name of the column for which the AdjustedArray is being + computed. + column : BoundColumn + The column for which the AdjustedArray is being computed. + mask : + assets : + qtr_crossover_point : + Whether we should use the 'right' or 'left' side when doing + searchsorted on the dates for quarter boundaries. + + Returns + ------- + adjusted_array : AdjustedArray + The array of data and overwrites for the given column. + """ adjustments = defaultdict(list) + requested_qtr_data = self.get_requested_data_for_col( + stacked_last_per_qtr, requested_qtr_idx, dates + ) + zero_qtr_data = stacked_last_per_qtr.loc[zero_qtr_idx] + # We no longer need this in the index, but we do need it as a column + # to calculate adjustments. + zero_qtr_data = zero_qtr_data.reset_index(NORMALIZED_QUARTERS) if column.dtype == datetime64ns_dtype: overwrite = Datetime641DArrayOverwrite + missing_value = np.datetime64('NaT', 'ns') else: overwrite = Float641DArrayOverwrite + missing_value = np.NaN for sid_idx, sid in enumerate(assets): - sid_result = result[result.index.get_level_values( - SID_FIELD_NAME - ) == sid] - sid_result = sid_result.reset_index( - level='shifted_normalized_quarters' - ) # Remove qtrs from index to find shifts - # Figure out where we think quarters are changing. - qtr_shifts = sid_result[ - sid_result['shifted_normalized_quarters'] != - sid_result['shifted_normalized_quarters'].shift(1) + zero_qtr_sid_data = zero_qtr_data[ + zero_qtr_data.index.get_level_values(SID_FIELD_NAME) == sid ] - # Iterate backwards. No adjustment for 1st quarter. - for row_indexer in list(reversed(qtr_shifts.index))[:-1]: - # We want to write the values for this row's quarter over - # everything that comes before this quarter when we are at - # the date before this quarter starts. 
- qtr_start_idx = last.index.get_loc(row_indexer[0]) - quarter = qtr_shifts.loc[row_indexer][ - 'shifted_normalized_quarters' - ] - adjustments[qtr_start_idx] = \ - [overwrite(0, - qtr_start_idx - 1, # get index date - sid_idx, - sid_idx, - last[column_name, quarter, - sid][:qtr_start_idx].values) - ] + # Determine where quarters are changing for this sid. + qtr_shifts = zero_qtr_sid_data[ + zero_qtr_sid_data[NORMALIZED_QUARTERS] != + zero_qtr_sid_data[NORMALIZED_QUARTERS].shift(1) + ] + # On dates where we don't have any information about quarters, + # we will get nulls, and each of these will be interpreted as + # quarter shifts. We need to remove these here. + qtr_shifts = qtr_shifts[ + qtr_shifts[NORMALIZED_QUARTERS].notnull() + ] + # For the given sid, determine which quarters we have estimates + # for. + quarters_with_estimates_for_sid = last_per_qtr.xs( + sid, axis=1, level=SID_FIELD_NAME + ).groupby(axis=1, level=1).first().columns.values + for row_indexer in list(qtr_shifts.index): + # Find the starting index of the quarter that comes right + # after this row. This isn't the starting index of the + # requested quarter, but simply the date we cross over into a + # new quarter. + qtr_start_idx = dates.searchsorted( + zero_qtr_data.loc[ + row_indexer + ][EVENT_DATE_FIELD_NAME], + side=qtr_crossover_point + ) + + # Only add adjustments if the next quarter starts somewhere in + # our date index for this sid. Our 'next' quarter can never + # start at index 0; a starting index of 0 means that the next + # quarter's event date was NaT. + if 0 < qtr_start_idx < len(dates): + # Find the quarter being requested in the quarter we're + # crossing into. + requested_quarter = requested_qtr_data[ + SHIFTED_NORMALIZED_QTRS + ][sid].iloc[qtr_start_idx] + + # If there are estimates for the requested quarter, + # overwrite all values going up to the starting index of + # that quarter with estimates for that quarter. 
+ if requested_quarter in quarters_with_estimates_for_sid: + adjustments[qtr_start_idx] = \ + [overwrite( + 0, + qtr_start_idx - 1, # overwrite thru last qtr + sid_idx, + sid_idx, + last_per_qtr[column_name, + requested_quarter, + sid][:qtr_start_idx].values)] + # There are no estimates for the quarter. Overwrite all + # values going up to the starting index of that quarter + # with the missing value for this column. + else: + adjustments[qtr_start_idx] = [ + overwrite( + 0, + qtr_start_idx - 1, + sid_idx, + sid_idx, + np.array( + [missing_value] * + len(last_per_qtr.index[:qtr_start_idx])) + ) + ] return AdjustedArray( - col_result.values.astype(column.dtype), + requested_qtr_data[column_name].values.astype(column.dtype), mask, dict(adjustments), column.missing_value, @@ -152,110 +292,101 @@ class QuarterEstimatesLoader(PipelineLoader): # attribute, given that they're created dynamically? groups = groupby(lambda x: x.dataset.num_quarters, columns) groups_columns = dict(groups) - if (pd.Series(groups_columns) < 0).any(): + if (pd.Series(groups_columns.keys()) < 0).any(): raise ValueError("Must pass a number of quarters >= 0") out = {} - date_values = pd.DataFrame({SIMULTATION_DATES: dates}) - # dates column must be of type datetime64[ns] in order for subsequent - # comparisons to work correctly. - date_values[SIMULTATION_DATES] = date_values[ - SIMULTATION_DATES - ].astype('datetime64[ns]') - self.estimates['normalized_quarters'] = normalize_quarters( + self.estimates[NORMALIZED_QUARTERS] = normalize_quarters( self.estimates[FISCAL_YEAR_FIELD_NAME], self.estimates[FISCAL_QUARTER_FIELD_NAME], - ).astype(float) - for num_quarters, columns in groups_columns.iteritems(): - name_map = {c: - self.base_column_name_map[ + ) + for num_quarters, columns in groups_columns.items(): + # The column's dataset is itself dynamic and the mapping we + # actually want is to its dataset's parent's column name. 
+ name_map = {c: self.base_column_name_map[ getattr(c.dataset.__base__, c.name) ] for c in columns} # Determine the last piece of information we know for each column - # on each date in the index. - last = last_in_date_group(self.estimates, True, dates, - assets, - extra_groupers=[ - 'normalized_quarters']) - # Forward fill values for each quarter. - ffill_across_cols(last, columns) - stacked = last.stack(1).stack(1) + # on each date in the index for each sid and quarter. + last_per_qtr = last_in_date_group( + self.estimates, True, dates, assets, + extra_groupers=[NORMALIZED_QUARTERS] + ) - result = self.load_quarters(num_quarters, stacked) + # Forward fill values for each quarter/sid/dataset column. + ffill_across_cols(last_per_qtr, columns) + # Stack quarter and sid into the index. + stacked_last_per_qtr = last_per_qtr.stack([NORMALIZED_QUARTERS, + SID_FIELD_NAME]) + # Set date index name for ease of reference + stacked_last_per_qtr.index.set_names(SIMULTATION_DATES, 0, True) + # Determine which quarter is next/previous for each date. 
+ shifted_qtr_data = self.load_quarters(num_quarters, + stacked_last_per_qtr) + zero_qtr_idx = shifted_qtr_data.index + requested_qtr_idx = shifted_qtr_data.set_index([ + shifted_qtr_data.index.get_level_values( + SIMULTATION_DATES + ), + shifted_qtr_data[SHIFTED_NORMALIZED_QTRS], + shifted_qtr_data.index.get_level_values( + SID_FIELD_NAME + )] + ).index for c in columns: column_name = name_map[c] - col_result = result[ - column_name - ].reset_index(1, drop=True).unstack(1).reindex(dates) - adjusted_array = self.get_adjustments(result, - col_result, - last, + adjusted_array = self.get_adjustments(zero_qtr_idx, + requested_qtr_idx, + stacked_last_per_qtr, + last_per_qtr, + dates, column_name, c, mask, - assets) + assets, + self.qtr_crossover_point) out[c] = adjusted_array return out class NextQuartersEstimatesLoader(QuarterEstimatesLoader): + qtr_crossover_point = 'right' - def load_quarters(self, num_quarters, stacked): + def load_quarters(self, num_quarters, stacked_last_per_qtr): # Filter for releases that are on or after each simulation date and # determine the next quarter by picking out the upcoming release for # each date in the index. 
- stacked = stacked.sort(EVENT_DATE_FIELD_NAME) - next_releases = stacked.loc[ - stacked[EVENT_DATE_FIELD_NAME] >= stacked.index.get_level_values( - 0 - )].groupby(level=[0, 2]).nth(0) - next_releases[ - 'shifted_normalized_quarters' - ] = next_releases.index.get_level_values( - 'normalized_quarters' + stacked_last_per_qtr = stacked_last_per_qtr.sort( + EVENT_DATE_FIELD_NAME + ) + next_releases_per_date = stacked_last_per_qtr.loc[ + stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >= + stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) + ].groupby(level=[SIMULTATION_DATES, SID_FIELD_NAME]).nth(0) + next_releases_per_date[ + SHIFTED_NORMALIZED_QTRS + ] = next_releases_per_date.index.get_level_values( + NORMALIZED_QUARTERS ) + (num_quarters - 1) - next_releases = next_releases.set_index([ - next_releases.index.get_level_values(0), # dates - 'shifted_normalized_quarters', - next_releases.index.get_level_values(2) # sids - ]) - return stacked.loc[next_releases.index] + return next_releases_per_date class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): - def __init__(self, - estimates, - columns): - super(PreviousQuartersEstimatesLoader, self).__init__(estimates, - columns) + qtr_crossover_point = 'left' - def load_quarters(self, num_quarters, dates_sids, final_releases_per_qtr): - # Filter for releases that are on or before each simulation date. - eligible_previous_releases = final_releases_per_qtr[ - final_releases_per_qtr[EVENT_DATE_FIELD_NAME] <= - final_releases_per_qtr[SIMULTATION_DATES] - ] - # For each sid, get the latest release. - eligible_previous_releases.sort(EVENT_DATE_FIELD_NAME) - previous_releases = eligible_previous_releases.groupby( - [SIMULTATION_DATES, SID_FIELD_NAME] - ).nth(-1).reset_index() # We use nth here to avoid forward filling - # NaNs, which `last()` will do. 
- previous_releases = previous_releases.rename(columns={ - FISCAL_YEAR_FIELD_NAME: PREVIOUS_FISCAL_YEAR, - FISCAL_QUARTER_FIELD_NAME: PREVIOUS_FISCAL_QUARTER - }) - # The previous fiscal quarter is already our starting point, - # so we should offset `num_quarters` by 1. - (previous_releases[FISCAL_YEAR_FIELD_NAME], - previous_releases[FISCAL_QUARTER_FIELD_NAME]) = shift_quarters( - -(num_quarters - 1), - previous_releases[PREVIOUS_FISCAL_YEAR], - previous_releases[PREVIOUS_FISCAL_QUARTER], - ) - # Do a left merge to get values for each date. - result = dates_sids.merge(previous_releases, - on=([SIMULTATION_DATES, - SID_FIELD_NAME]), - how='left') - return result + def load_quarters(self, num_quarters, stacked_last_per_qtr): + # Filter for releases that are on or before each simulation date and + # determine the previous quarter by picking out the most recent release + # for each date in the index. + stacked_last_per_qtr = stacked_last_per_qtr.sort(EVENT_DATE_FIELD_NAME) + previous_releases_per_date = stacked_last_per_qtr.loc[ + stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <= + stacked_last_per_qtr.index.get_level_values( + SIMULTATION_DATES + )].groupby(level=[SIMULTATION_DATES, SID_FIELD_NAME]).nth(-1) + previous_releases_per_date[ + SHIFTED_NORMALIZED_QTRS + ] = previous_releases_per_date.index.get_level_values( + NORMALIZED_QUARTERS + ) - (num_quarters - 1) + return previous_releases_per_date diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 28e0c90d..5bd91132 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -278,11 +278,41 @@ def check_data_query_args(data_query_time, data_query_tz): def last_in_date_group(df, reindex, dates, assets, have_sids=True, extra_groupers=[]): + """ + Determine the last piece of information known on each date in the date + index for each group. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame containing the data to be grouped.
+ reindex : bool + Whether or not the DataFrame should be reindexed against the date + index. This will add back any dates to the index that were grouped + away. + dates : pd.DatetimeIndex + The dates to use for grouping and reindexing. + assets : pd.Int64Index + The assets that should be included in the column multiindex. + have_sids : bool + Whether or not the DataFrame has sids. If it does, they will be used + in the groupby. + extra_groupers : list of str + Any extra field names that should be included in the groupby. + + Returns + ------- + last_in_group : pd.DataFrame + A DataFrame with dates as the index and fields used in the groupby as + levels of a multiindex of columns. + + """ idx = dates[dates.searchsorted( df[TS_FIELD_NAME].values.astype('datetime64[D]') )] if have_sids: - idx = [idx, SID_FIELD_NAME] + extra_groupers + idx = [idx, SID_FIELD_NAME] + idx += extra_groupers last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby( idx, @@ -291,8 +321,7 @@ def last_in_date_group(df, reindex, dates, assets, have_sids=True, # For the number of things that we're grouping by (except TS), unstack # the df - for _ in range(len(idx) - 1): - last_in_group = last_in_group.unstack() + last_in_group = last_in_group.unstack([-1, -2]) if reindex: if have_sids: @@ -311,6 +340,18 @@ def last_in_date_group(df, reindex, dates, assets, have_sids=True, def ffill_across_cols(df, columns): + """ + Forward fill values in a DataFrame with special logic to handle cases + that pd.DataFrame.ffill cannot and cast columns to appropriate types. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to do forward-filling on. + columns : list of BoundColumn + The BoundColumns that correspond to columns in the DataFrame to which + special filling and/or casting logic should be applied. + """ df.ffill(inplace=True) # Fill in missing values specified by each column. 
This is made From cc07a00d1679cbd6dee9cbf30a72ad33f94a4771 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Mon, 29 Aug 2016 17:48:22 -0400 Subject: [PATCH 09/16] TST: add test for requesting multiple datasets with multiple quarters --- tests/pipeline/test_quarters_estimates.py | 156 +++++++++++++++++++++- 1 file changed, 150 insertions(+), 6 deletions(-) diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 5129bb5a..bb263f22 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -3,6 +3,7 @@ import itertools from nose_parameterized import parameterized import numpy as np import pandas as pd +from toolz import merge from zipline.pipeline import SimplePipelineEngine, Pipeline, CustomFactor from zipline.pipeline.common import ( @@ -137,10 +138,8 @@ class EstimateTestCase(WithAssetFinder, Estimates.fiscal_quarter: FISCAL_QUARTER_FIELD_NAME, Estimates.fiscal_year: FISCAL_YEAR_FIELD_NAME, } - cls.loader = cls.make_loader( - events=cls.events, - columns=cls.columns - ) + cls.loader = cls.make_loader(cls.events, cls.columns) + cls.ASSET_FINDER_EQUITY_SIDS = list( cls.events[SID_FIELD_NAME].unique() ) @@ -179,9 +178,9 @@ window_test_cases = [ class NextEstimateWindowsTestCase(EstimateTestCase): - events = estimates_timeline START_DATE = pd.Timestamp('2014-12-31') END_DATE = pd.Timestamp('2015-02-15') + events = estimates_timeline @classmethod def make_loader(cls, events, columns): @@ -236,9 +235,9 @@ class NextEstimateWindowsTestCase(EstimateTestCase): class PreviousEstimateWindowsTestCase(EstimateTestCase): - events = estimates_timeline START_DATE = pd.Timestamp('2014-12-31') END_DATE = pd.Timestamp('2015-02-15') + events = estimates_timeline @classmethod def make_loader(cls, events, columns): @@ -364,6 +363,151 @@ class NextEstimateTestCase(EstimateTestCase): self._test_wrong_num_quarters_passed() +class NextEstimateMultipleQuartersTestCase(EstimateTestCase): + events 
= pd.DataFrame({ + SID_FIELD_NAME: [0] * 2, + TS_FIELD_NAME: [pd.Timestamp('2015-01-01'), + pd.Timestamp('2015-01-06')], + EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-20')], + 'estimate': [1., 2.], + FISCAL_QUARTER_FIELD_NAME: [1, 2], + FISCAL_YEAR_FIELD_NAME: [2015, 2015] + }) + + @classmethod + def make_loader(cls, events, columns): + return NextQuartersEstimatesLoader(events, columns) + + def test_multiple_qtrs_requested(self): + """ + This test asks for datasets that calculate which estimates to + return for multiple quarters out and checks that the returned columns + contain data for the correct number of quarters out. + """ + dataset1 = QuartersEstimates(1) + dataset2 = QuartersEstimates(2) + engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + + results = engine.run_pipeline( + Pipeline( + merge([{c.name + '1': c.latest for c in dataset1.columns}, + {c.name + '2': c.latest for c in dataset2.columns}]) + ), + start_date=self.trading_days[0], + end_date=self.trading_days[-1], + ) + q1_columns = [col.name + '1' for col in self.columns] + q2_columns = [col.name + '2' for col in self.columns] + + # We now expect a column for 1 quarter out and a column for 2 + # quarters out for each of the dataset columns. + assert np.array_equal(sorted(np.array(q1_columns + q2_columns)), + sorted(results.columns.values)) + + def check_null_range(start_date, stop_date, col_name): + # Make sure that values in the given column/range are all null. + assert ( + results.loc[ + start_date:stop_date + ][col_name].isnull() + ).all() + + def check_values(start_date, end_date, col_name, qtr, event_idx): + # Make sure that values in the given column/range are all equal + # to the value at the given index from the raw data. 
+ assert ( + results.loc[ + start_date:end_date + ][col_name + qtr] == + self.events[col_name][event_idx] + ).all() + + # Although it's painful to check the ranges one by one for different + # columns, it's important to do this so that we have a clear + # understanding of how knowledge/event dates interact and give us + # values for 1Q out and 2Q out. + for col in self.columns: + # 1Q out cols + check_null_range(self.START_DATE, + pd.Timestamp('2014-12-31'), + col.name + '1') + check_values(pd.Timestamp('2015-01-02'), + pd.Timestamp('2015-01-10'), + col.name, + '1', + 0) # First event is our 1Q out + check_values(pd.Timestamp('2015-01-11'), + pd.Timestamp('2015-01-20'), + col.name, + '1', + 1) # Second event becomes our 1Q out + check_null_range(pd.Timestamp('2015-01-21'), + self.END_DATE, + col.name + '1') + + # Fiscal year and quarter are different for 2Q out because even when we + # have no data for 2Q out, we still know which fiscal year/quarter we + # want data for as long as we have data for 1Q out. + for col in set(self.columns.keys()) - {Estimates.fiscal_year, + Estimates.fiscal_quarter}: + # 2Q out cols + check_null_range(self.START_DATE, + pd.Timestamp('2015-01-05'), + col.name + '2') + # We have data for 2Q out when our knowledge of + # the next quarter and the quarter after that + # overlaps and before the next quarter's event + # happens. + check_values(pd.Timestamp('2015-01-06'), + pd.Timestamp('2015-01-10'), + col.name, + '2', + 1) + check_null_range(pd.Timestamp('2015-01-11'), + self.END_DATE, + col.name + '2') + + # Check fiscal year/quarter for 2Q out. + check_null_range(self.START_DATE, + pd.Timestamp('2015-01-01'), + Estimates.fiscal_quarter.name + '2') + check_null_range(self.START_DATE, + pd.Timestamp('2015-01-01'), + Estimates.fiscal_year.name + '2') + # We have a different quarter number than the quarter numbers we have + # in our data for 2Q out, so assert manually. 
+ assert ( + results.loc[ + pd.Timestamp('2015-01-02'):pd.Timestamp('2015-01-10') + ][Estimates.fiscal_quarter.name + '2'] == + 2 + ).all() + assert ( + results.loc[ + pd.Timestamp('2015-01-10'):pd.Timestamp('2015-01-20') + ][Estimates.fiscal_quarter.name + '2'] == + 3 + ).all() + # We have the same fiscal year, 2015, for 2Q out over the date range of + # interest. + check_values(pd.Timestamp('2015-01-02'), + pd.Timestamp('2015-01-20'), + Estimates.fiscal_year.name, + '2', + 1) + check_null_range(pd.Timestamp('2015-01-21'), + self.END_DATE, + Estimates.fiscal_quarter.name + '2') + check_null_range(pd.Timestamp('2015-01-21'), + self.END_DATE, + Estimates.fiscal_year.name + '2') + + class BlazeNextEstimateLoaderTestCase(NextEstimateTestCase): """ Run the same tests as EventsLoaderTestCase, but using a BlazeEventsLoader. From 2a09160ca803760d1fa2c0e0ce702e5da0fd6d1b Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 30 Aug 2016 10:13:38 -0400 Subject: [PATCH 10/16] TST: add test to check previous columns w/ multiple qtrs MAINT: pass column to name dict MAINT: make check for invalid num columns py3-compatible --- tests/pipeline/test_adjusted_array.py | 197 ++- tests/pipeline/test_quarters_estimates.py | 1173 ++++++++++------- zipline/lib/adjustment.pyx | 50 +- zipline/pipeline/loaders/blaze/core.py | 9 +- zipline/pipeline/loaders/blaze/estimates.py | 10 +- zipline/pipeline/loaders/blaze/events.py | 18 +- zipline/pipeline/loaders/blaze/utils.py | 13 +- zipline/pipeline/loaders/quarter_estimates.py | 381 ++++-- zipline/pipeline/loaders/utils.py | 32 +- zipline/testing/core.py | 6 + zipline/testing/fixtures.py | 5 +- 11 files changed, 1154 insertions(+), 740 deletions(-) diff --git a/tests/pipeline/test_adjusted_array.py b/tests/pipeline/test_adjusted_array.py index a5bf3ed0..730af952 100644 --- a/tests/pipeline/test_adjusted_array.py +++ b/tests/pipeline/test_adjusted_array.py @@ -202,11 +202,7 @@ def _gen_multiplicative_adjustment_cases(dtype): ) -def
_gen_overwrite_adjustment_cases(name, - make_input, - make_expected_output, - dtype, - missing_value): +def _gen_overwrite_adjustment_cases(dtype): """ Generate test cases for overwrite adjustments. @@ -226,6 +222,8 @@ def _gen_overwrite_adjustment_cases(name, unicode_dtype: ObjectOverwrite, object_dtype: ObjectOverwrite, }[dtype] + make_expected_dtype = as_dtype(dtype) + missing_value = default_missing_value_for_dtype(datetime64ns_dtype) if dtype == object_dtype: # When we're testing object dtypes, we expect to have strings, but @@ -237,30 +235,30 @@ def _gen_overwrite_adjustment_cases(name, adjustments = {} buffer_as_of = [None] * 6 - baseline = make_input([[2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + baseline = make_expected_dtype([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) - buffer_as_of[0] = make_expected_output([[2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[0] = make_expected_dtype([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) # Note that row indices are inclusive! adjustments[1] = [ adjustment_type(0, 0, 0, 0, make_overwrite_value(dtype, 1)), ] - buffer_as_of[1] = make_expected_output([[1, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[1] = make_expected_dtype([[1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) # No adjustment at index 2. 
buffer_as_of[2] = buffer_as_of[1] @@ -269,33 +267,33 @@ def _gen_overwrite_adjustment_cases(name, adjustment_type(1, 2, 1, 1, make_overwrite_value(dtype, 3)), adjustment_type(0, 1, 0, 0, make_overwrite_value(dtype, 4)), ] - buffer_as_of[3] = make_expected_output([[4, 2, 2], - [4, 3, 2], - [2, 3, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[3] = make_expected_dtype([[4, 2, 2], + [4, 3, 2], + [2, 3, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) adjustments[4] = [ adjustment_type(0, 3, 2, 2, make_overwrite_value(dtype, 5)) ] - buffer_as_of[4] = make_expected_output([[4, 2, 5], - [4, 3, 5], - [2, 3, 5], - [2, 2, 5], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[4] = make_expected_dtype([[4, 2, 5], + [4, 3, 5], + [2, 3, 5], + [2, 2, 5], + [2, 2, 2], + [2, 2, 2]]) adjustments[5] = [ adjustment_type(0, 4, 1, 1, make_overwrite_value(dtype, 6)), adjustment_type(2, 2, 2, 2, make_overwrite_value(dtype, 7)), ] - buffer_as_of[5] = make_expected_output([[4, 6, 5], - [4, 6, 5], - [2, 6, 7], - [2, 6, 5], - [2, 6, 2], - [2, 2, 2]]) + buffer_as_of[5] = make_expected_dtype([[4, 6, 5], + [4, 6, 5], + [2, 6, 7], + [2, 6, 5], + [2, 6, 2], + [2, 2, 2]]) return _gen_expectations( baseline, @@ -306,11 +304,7 @@ def _gen_overwrite_adjustment_cases(name, ) -def _gen_overwrite_1d_array_adjustment_case(name, - make_input, - make_expected_output, - dtype, - missing_value): +def _gen_overwrite_1d_array_adjustment_case(dtype): """ Generate test cases for overwrite adjustments. 
@@ -327,21 +321,24 @@ def _gen_overwrite_1d_array_adjustment_case(name, float64_dtype: Float641DArrayOverwrite, datetime64ns_dtype: Datetime641DArrayOverwrite, }[dtype] + make_expected_dtype = as_dtype(dtype) + missing_value = default_missing_value_for_dtype(datetime64ns_dtype) + adjustments = {} buffer_as_of = [None] * 6 - baseline = make_input([[2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + baseline = make_expected_dtype([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) - buffer_as_of[0] = make_expected_output([[2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[0] = make_expected_dtype([[2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) vals1 = [1] # Note that row indices are inclusive! @@ -351,12 +348,12 @@ def _gen_overwrite_1d_array_adjustment_case(name, array([coerce_to_dtype(dtype, val) for val in vals1]) ) ] - buffer_as_of[1] = make_input([[1, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[1] = make_expected_dtype([[1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) # No adjustment at index 2. 
buffer_as_of[2] = buffer_as_of[1] @@ -368,12 +365,12 @@ def _gen_overwrite_1d_array_adjustment_case(name, array([coerce_to_dtype(dtype, val) for val in vals3]) ) ] - buffer_as_of[3] = make_input([[4, 2, 2], - [4, 2, 2], - [1, 2, 2], - [2, 2, 2], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[3] = make_expected_dtype([[4, 2, 2], + [4, 2, 2], + [1, 2, 2], + [2, 2, 2], + [2, 2, 2], + [2, 2, 2]]) vals4 = [5] * 4 adjustments[4] = [ @@ -381,12 +378,12 @@ def _gen_overwrite_1d_array_adjustment_case(name, 0, 3, 2, 2, array([coerce_to_dtype(dtype, val) for val in vals4])) ] - buffer_as_of[4] = make_input([[4, 2, 5], - [4, 2, 5], - [1, 2, 5], - [2, 2, 5], - [2, 2, 2], - [2, 2, 2]]) + buffer_as_of[4] = make_expected_dtype([[4, 2, 5], + [4, 2, 5], + [1, 2, 5], + [2, 2, 5], + [2, 2, 2], + [2, 2, 2]]) vals5 = range(1, 6) adjustments[5] = [ @@ -394,12 +391,12 @@ def _gen_overwrite_1d_array_adjustment_case(name, 0, 4, 1, 1, array([coerce_to_dtype(dtype, val) for val in vals5])), ] - buffer_as_of[5] = make_input([[4, 1, 5], - [4, 2, 5], - [1, 3, 5], - [2, 4, 5], - [2, 5, 2], - [2, 2, 2]]) + buffer_as_of[5] = make_expected_dtype([[4, 1, 5], + [4, 2, 5], + [1, 3, 5], + [2, 4, 5], + [2, 5, 2], + [2, 2, 2]]) return _gen_expectations( baseline, @@ -532,38 +529,10 @@ class AdjustedArrayTestCase(TestCase): @parameterized.expand( chain( - _gen_overwrite_adjustment_cases( - 'float', - make_input=as_dtype(float64_dtype), - make_expected_output=as_dtype(float64_dtype), - dtype=float64_dtype, - missing_value=default_missing_value_for_dtype(float64_dtype), - ), - _gen_overwrite_adjustment_cases( - 'datetime', - make_input=as_dtype(datetime64ns_dtype), - make_expected_output=as_dtype(datetime64ns_dtype), - dtype=datetime64ns_dtype, - missing_value=default_missing_value_for_dtype( - datetime64ns_dtype, - ), - ), - _gen_overwrite_1d_array_adjustment_case( - 'float', - make_input=as_dtype(float64_dtype), - make_expected_output=as_dtype(float64_dtype), - dtype=float64_dtype, - 
missing_value=default_missing_value_for_dtype(float64_dtype), - ), - _gen_overwrite_1d_array_adjustment_case( - 'datetime', - make_input=as_dtype(datetime64ns_dtype), - make_expected_output=as_dtype(datetime64ns_dtype), - dtype=datetime64ns_dtype, - missing_value=default_missing_value_for_dtype( - datetime64ns_dtype, - ), - ), + _gen_overwrite_adjustment_cases(float64_dtype), + _gen_overwrite_adjustment_cases(datetime64ns_dtype), + _gen_overwrite_1d_array_adjustment_case(float64_dtype), + _gen_overwrite_1d_array_adjustment_case(datetime64ns_dtype), # There are six cases here: # Using np.bytes/np.unicode/object arrays as inputs. # Passing np.bytes/np.unicode/object arrays to LabelArray, diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index bb263f22..9217df77 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -1,8 +1,10 @@ import blaze as bz import itertools +from nose.tools import assert_true from nose_parameterized import parameterized import numpy as np import pandas as pd +from pandas.util.testing import assert_frame_equal from toolz import merge from zipline.pipeline import SimplePipelineEngine, Pipeline, CustomFactor @@ -13,19 +15,26 @@ from zipline.pipeline.common import ( SID_FIELD_NAME, TS_FIELD_NAME, ) -from zipline.pipeline.data import DataSet, Column +from zipline.pipeline.data import DataSet +from zipline.pipeline.data import Column from zipline.pipeline.loaders.blaze.estimates import ( BlazeNextEstimatesLoader, BlazePreviousEstimatesLoader ) from zipline.pipeline.loaders.quarter_estimates import ( NextQuartersEstimatesLoader, + normalize_quarters, PreviousQuartersEstimatesLoader, - split_normalized_quarters, normalize_quarters) -from zipline.testing import ZiplineTestCase -from zipline.testing.fixtures import WithAssetFinder, WithTradingSessions + split_normalized_quarters, +) +from zipline.testing.fixtures import ( + WithAssetFinder, + 
WithTradingSessions, + ZiplineTestCase, +) from zipline.testing.predicates import assert_equal -from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype +from zipline.utils.numpy_utils import datetime64ns_dtype +from zipline.utils.numpy_utils import float64_dtype class Estimates(DataSet): @@ -42,276 +51,267 @@ def QuartersEstimates(num_qtr): return QtrEstimates -# 0Q1: 2015-01-05.Q1.e1.2015-01-06, 2015-01-10.Q1.e1.2015-01-11, -# 0Q2: 2015-01-15.Q2.e1.2015-01-16, 2015-01-20.Q2.e1.2015-01-21, -# 0Q4: 2015-02-05.Q4.e1.2015-02-06, 2015-02-10.Q4.e1.2015-02-11, -# Skip Q3 to make sure we handle skipped quarter data correctly. -estimates_timeline = pd.DataFrame({ - TS_FIELD_NAME: [pd.Timestamp('2015-01-05'), pd.Timestamp('2015-01-07'), - pd.Timestamp('2015-01-05'), pd.Timestamp('2015-01-17'), - pd.Timestamp('2015-01-05'), pd.Timestamp('2015-01-17'), - pd.Timestamp('2015-01-22'), pd.Timestamp('2015-02-02')], - EVENT_DATE_FIELD_NAME: - [pd.Timestamp('2015-01-10'), pd.Timestamp('2015-01-10'), - pd.Timestamp('2015-01-20'), pd.Timestamp('2015-01-20'), - pd.Timestamp('2015-02-10'), pd.Timestamp('2015-02-10'), - pd.Timestamp('2015-02-10'), pd.Timestamp('2015-02-10')], - 'estimate': [1.]*2 + [2.] * 2 + [4.] * 4, - FISCAL_QUARTER_FIELD_NAME: [1]*2 + [2] * 2 + [4] * 4, - FISCAL_YEAR_FIELD_NAME: [2015]*8, - SID_FIELD_NAME: [0]*8 -}) +class WithEstimates(WithTradingSessions, WithAssetFinder): + """ + ZiplineTestCase mixin providing cls.loader and cls.events as class + level fixtures. -# Final release dates never change. The quarters have very tight date ranges -# in order to reduce the number of dates we need to iterate through when -# testing. 
-releases = pd.DataFrame({ - TS_FIELD_NAME: [pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-31')], - EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-15'), - pd.Timestamp('2015-01-31')], - 'estimate': [0.5, 0.8], - FISCAL_QUARTER_FIELD_NAME: [1.0, 2.0], - FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0] -}) + Methods + ------- + make_loader(events, columns) -> PipelineLoader + Method which returns the loader to be used throughout tests. -q1_knowledge_dates = [pd.Timestamp('2015-01-01'), pd.Timestamp('2015-01-04'), - pd.Timestamp('2015-01-08'), pd.Timestamp('2015-01-12')] -q2_knowledge_dates = [pd.Timestamp('2015-01-16'), pd.Timestamp('2015-01-20'), - pd.Timestamp('2015-01-24'), pd.Timestamp('2015-01-28')] -# We want to model the possibility of an estimate predicting a release date -# that doesn't match the actual release. This could be done by dynamically -# generating more combinations with different release dates, but that -# significantly increases the amount of time it takes to run the tests. These -# hard-coded cases are sufficient to know that we can update our beliefs when -# we get new information. -q1_release_dates = [pd.Timestamp('2015-01-15'), - pd.Timestamp('2015-01-16')] # One day late -q2_release_dates = [pd.Timestamp('2015-01-30'), # One day early - pd.Timestamp('2015-01-31')] -estimates = pd.DataFrame({ - EVENT_DATE_FIELD_NAME: q1_release_dates + q2_release_dates, - 'estimate': [.1, .2, .3, .4], - FISCAL_QUARTER_FIELD_NAME: [1.0, 1.0, 2.0, 2.0], - FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0, 2015.0, 2015.0] -}) + events : pd.DataFrame + The raw events to be used as input to the pipeline loader. + columns : dict[str -> str] + The dictionary mapping the names of BoundColumns to the + associated column name in the events DataFrame. 
+ """ - -def gen_estimates(): - sid_estimates = [] - sid_releases = [] - for sid, (q1e1, q1e2, q2e1, q2e2) in enumerate( - itertools.permutations(q1_knowledge_dates + q2_knowledge_dates, - 4) - ): - # We're assuming that estimates must come before the relevant release. - if (q1e1 < q1e2 and - q2e1 < q2e2 and - q1e1 < q1_release_dates[0] and - q1e2 < q1_release_dates[1]): - sid_estimate = estimates.copy(True) - sid_estimate[TS_FIELD_NAME] = [q1e1, q1e2, q2e1, q2e2] - sid_estimate[SID_FIELD_NAME] = sid - sid_estimates += [sid_estimate] - sid_release = releases.copy(True) - sid_release[SID_FIELD_NAME] = sid_estimate[SID_FIELD_NAME] - sid_releases += [sid_release] - return pd.concat(sid_estimates + sid_releases).reset_index(drop=True) - - -class EstimateTestCase(WithAssetFinder, - WithTradingSessions, - ZiplineTestCase): + # Short window defined in order for test to run faster. START_DATE = pd.Timestamp('2014-12-28') END_DATE = pd.Timestamp('2015-02-03') @classmethod def make_loader(cls, events, columns): - pass + raise NotImplementedError('make_loader') @classmethod def init_class_fixtures(cls): - cls.sids = cls.events['sid'].unique() + cls.sids = cls.events[SID_FIELD_NAME].unique() cls.columns = { - Estimates.estimate: 'estimate', - Estimates.event_date: EVENT_DATE_FIELD_NAME, - Estimates.fiscal_quarter: FISCAL_QUARTER_FIELD_NAME, - Estimates.fiscal_year: FISCAL_YEAR_FIELD_NAME, + Estimates.event_date: 'event_date', + Estimates.fiscal_quarter: 'fiscal_quarter', + Estimates.fiscal_year: 'fiscal_year', + Estimates.estimate: 'estimate' } - cls.loader = cls.make_loader(cls.events, cls.columns) - + cls.loader = cls.make_loader(cls.events, {column.name: val for + column, val in + cls.columns.items()}) cls.ASSET_FINDER_EQUITY_SIDS = list( cls.events[SID_FIELD_NAME].unique() ) cls.ASSET_FINDER_EQUITY_SYMBOLS = [ 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS ] - super(EstimateTestCase, cls).init_class_fixtures() + super(WithEstimates, cls).init_class_fixtures() + + 
+class WithWrongNumQuarters(WithEstimates): + """ + ZiplineTestCase mixin providing cls.events as a class level fixture and + defining a test for all inheritors to use. + + Attributes + ---------- + events : pd.DataFrame + A simple DataFrame with columns needed for estimates and a single sid + and no other data. + + Tests + ------ + test_wrong_num_quarters_passed() + Tests that loading with an incorrect quarter number raises an error. + """ + events = pd.DataFrame({SID_FIELD_NAME: 0}, + columns=[SID_FIELD_NAME, + TS_FIELD_NAME, + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + 'estimate'], + index=[0]) + + def test_wrong_num_quarters_passed(self): + dataset = QuartersEstimates(-1) + engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + p = Pipeline({c.name: c.latest for c in dataset.columns}) - def _test_wrong_num_quarters_passed(self): with self.assertRaises(ValueError): - dataset = QuartersEstimates(-1) - engine = SimplePipelineEngine( - lambda x: self.loader, - self.trading_days, - self.asset_finder, - ) - engine.run_pipeline( - Pipeline({c.name: c.latest for c in dataset.columns}), + p, start_date=self.trading_days[0], end_date=self.trading_days[-1], ) -window_test_cases = [ - (window_len, start_idx, num_quarters_out) for - (window_len, start_idx), num_quarters_out in - itertools.product( - [[5, pd.Timestamp('2015-01-09').tz_localize('utc')], - [6, pd.Timestamp('2015-01-12').tz_localize('utc')], - [11, pd.Timestamp('2015-01-20').tz_localize('utc')], - [19, pd.Timestamp('2015-01-30').tz_localize('utc')], - [26, pd.Timestamp('2015-02-10').tz_localize('utc')]], - [1, 2, 3, 4]) -] - - -class NextEstimateWindowsTestCase(EstimateTestCase): - START_DATE = pd.Timestamp('2014-12-31') - END_DATE = pd.Timestamp('2015-02-15') - events = estimates_timeline - - @classmethod - def make_loader(cls, events, columns): - return NextQuartersEstimatesLoader(events, columns) - - 
@parameterized.expand(window_test_cases) - def test_next_estimate_windows_at_quarter_boundaries(self, - window_len, - start_idx, - num_quarters_out): - """ - Tests that we overwrite values with the correct quarter's estimate at - the correct dates. - """ - dataset = QuartersEstimates(num_quarters_out) - - class SomeFactor(CustomFactor): - inputs = [dataset.estimate] - window_length = window_len - - def compute(self, today, assets, out, *inputs): - unique_inputs = np.unique(inputs).tolist() - requested_quarter = None - if (pd.Timestamp('2015-02-10').tz_localize('utc') >= today >= - pd.Timestamp('2015-01-05').tz_localize('utc')): - next_quarter = estimates_timeline[ - estimates_timeline[EVENT_DATE_FIELD_NAME] >= today - ].min()[FISCAL_QUARTER_FIELD_NAME] - requested_quarter = next_quarter + num_quarters_out - 1 - - # If we know something about the requested quarter, assert - # that all our estimates in the window are about that quarter. - if requested_quarter and requested_quarter <= 4 and \ - requested_quarter != 3: - assert np.equal(unique_inputs, requested_quarter).all() - else: - # We don't have any information yet about the next quarter - # or about the requested quarter; in that case, all our - # estimates in the window should be NaN across time. - assert np.isnan(unique_inputs).all() - - engine = SimplePipelineEngine( - lambda x: self.loader, - self.trading_days, - self.asset_finder, - ) - engine.run_pipeline( - Pipeline({'est': SomeFactor()}), - start_date=start_idx, - end_date=self.trading_days[-1], - ) - - -class PreviousEstimateWindowsTestCase(EstimateTestCase): - START_DATE = pd.Timestamp('2014-12-31') - END_DATE = pd.Timestamp('2015-02-15') - events = estimates_timeline - +class PreviousWithWrongNumQuarters(WithWrongNumQuarters, + ZiplineTestCase): + """ + Tests that previous quarter loader correctly breaks if an incorrect + number of quarters is passed. 
+ """ @classmethod def make_loader(cls, events, columns): return PreviousQuartersEstimatesLoader(events, columns) - @parameterized.expand(window_test_cases) - def test_previous_estimate_windows_at_quarter_boundaries(self, - window_len, - start_idx, - num_quarters_out): - """ - Tests that we overwrite values with the correct quarter's estimate at - the correct dates. - """ - dataset = QuartersEstimates(num_quarters_out) - - class SomeFactor(CustomFactor): - inputs = [dataset.estimate] - window_length = window_len - - def compute(self, today, assets, out, *inputs): - unique_inputs = np.unique(inputs).tolist() - requested_quarter = None - if today >= pd.Timestamp('2015-01-12').tz_localize('utc'): - previous_quarter = estimates_timeline[ - estimates_timeline[EVENT_DATE_FIELD_NAME] <= today - ].max()[FISCAL_QUARTER_FIELD_NAME] - requested_quarter = ( - previous_quarter - (num_quarters_out - 1) - ) - - # If we know something about the requested quarter, assert - # that all our estimates in the window are about that quarter. - if requested_quarter and requested_quarter >= 0 and \ - requested_quarter != 3: - assert np.equal(unique_inputs, requested_quarter).all() - else: - # We don't have any information yet about the previous - # quarter - # or about the requested quarter; in that case, all our - # estimates in the window should be NaN across time. - assert np.isnan(unique_inputs).all() - - engine = SimplePipelineEngine( - lambda x: self.loader, - self.trading_days, - self.asset_finder, - ) - engine.run_pipeline( - Pipeline({'est': SomeFactor()}), - start_date=start_idx, - end_date=self.trading_days[-1], - ) - - -class NextEstimateTestCase(EstimateTestCase): - events = gen_estimates() +class NextWithWrongNumQuarters(WithWrongNumQuarters, + ZiplineTestCase): + """ + Tests that next quarter loader correctly breaks if an incorrect + number of quarters is passed. 
+ """ @classmethod def make_loader(cls, events, columns): return NextQuartersEstimatesLoader(events, columns) - def test_next_estimates(self): + +class WithEstimatesT0(WithEstimates): + """ + ZiplineTestCase mixin providing cls.events as a class level fixture and + defining a test for all inheritors to use. + + Attributes + ---------- + cls.events : pd.DataFrame + Generated dynamically in order to test inter-leavings of estimates and + event dates for multiple quarters to make sure that we select the + right immediate 'next' or 'previous' quarter relative to each date - + i.e., the right 't0' on the timeline. We care about selecting the + right 't0' because we use that to calculate which quarter's data needs + to be returned for each day. + + Methods + ------- + get_expected_estimate(q1_knowledge, + q2_knowledge, + comparable_date) -> pd.DataFrame + Retrieves the expected estimate given the latest knowledge about each + quarter and the date on which the estimate is being requested. If + there is no expected estimate, returns an empty DataFrame. + + Tests + ------ + test_estimates() + Tests that we get the right 't0' value on each day for each sid and + for each column. + """ + q1_knowledge_dates = [pd.Timestamp('2015-01-01'), + pd.Timestamp('2015-01-04'), + pd.Timestamp('2015-01-08'), + pd.Timestamp('2015-01-12')] + q2_knowledge_dates = [pd.Timestamp('2015-01-16'), + pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-01-24'), + pd.Timestamp('2015-01-28')] + # We want to model the possibility of an estimate predicting a release date + # that doesn't match the actual release. This could be done by dynamically + # generating more combinations with different release dates, but that + # significantly increases the amount of time it takes to run the tests. + # These hard-coded cases are sufficient to know that we can update our + # beliefs when we get new information. 
+ q1_release_dates = [pd.Timestamp('2015-01-15'), + pd.Timestamp('2015-01-16')] # One day late + q2_release_dates = [pd.Timestamp('2015-01-30'), # One day early + pd.Timestamp('2015-01-31')] + + @classmethod + def gen_estimates(cls): """ - The goal of this test is to make sure that we select the right - datapoint as our 'next' w.r.t each date. + In order to determine which estimate we care about for a particular + sid, we need to look at all estimates that we have for that sid and + their associated event dates. + + We define q1 < q2, and thus event1 < event2 since event1 occurs + during q1 and event2 occurs during q2 and we assume that there can + only be 1 event per quarter. We assume that there can be multiple + estimates per quarter leading up to the event. We assume that estimates + will not surpass the relevant event date. We will look at 2 estimates + for an event before the event occurs, since that is the simplest + scenario that covers the interesting edge cases: + - estimate values changing + - a release date changing + - estimates for different quarters interleaving + + Thus, we generate all possible inter-leavings of 2 estimates per + quarter-event where estimate1 < estimate2 and all estimates are < the + relevant event and assign each of these inter-leavings to a + different sid. """ + + sid_estimates = [] + sid_releases = [] + # We want all permutations of 2 knowledge dates per quarter. + it = enumerate( + itertools.permutations(cls.q1_knowledge_dates + + cls.q2_knowledge_dates, + 4) + ) + for sid, (q1e1, q1e2, q2e1, q2e2) in it: + # We're assuming that estimates must come before the relevant + # release. + if (q1e1 < q1e2 and + q2e1 < q2e2 and + # All estimates are < Q2's event, so just constrain Q1 + # estimates. 
+ q1e1 < cls.q1_release_dates[0] and + q1e2 < cls.q1_release_dates[0]): + sid_estimates.append(cls.create_estimates_df(q1e1, + q1e2, + q2e1, + q2e2, + sid)) + sid_releases.append(cls.create_releases_df(sid)) + + return pd.concat(sid_estimates + sid_releases).reset_index(drop=True) + + @classmethod + def create_releases_df(cls, sid): + # Final release dates never change. The quarters have very tight date + # ranges in order to reduce the number of dates we need to iterate + # through when testing. + return pd.DataFrame({ + TS_FIELD_NAME: [pd.Timestamp('2015-01-15'), + pd.Timestamp('2015-01-31')], + EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-15'), + pd.Timestamp('2015-01-31')], + 'estimate': [0.5, 0.8], + FISCAL_QUARTER_FIELD_NAME: [1.0, 2.0], + FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0], + SID_FIELD_NAME: sid + }) + + @classmethod + def create_estimates_df(cls, + q1e1, + q1e2, + q2e1, + q2e2, + sid): + return pd.DataFrame({ + EVENT_DATE_FIELD_NAME: cls.q1_release_dates + cls.q2_release_dates, + 'estimate': [.1, .2, .3, .4], + FISCAL_QUARTER_FIELD_NAME: [1.0, 1.0, 2.0, 2.0], + FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0, 2015.0, 2015.0], + TS_FIELD_NAME: [q1e1, q1e2, q2e1, q2e2], + SID_FIELD_NAME: sid, + }) + + @classmethod + def init_class_fixtures(cls): + # Must be generated before call to super since super uses `events`. 
+ cls.events = cls.gen_estimates() + super(WithEstimatesT0, cls).init_class_fixtures() + + def get_expected_estimate(self, + q1_knowledge, + q2_knowledge, + comparable_date): + return pd.DataFrame() + + def test_estimates(self): dataset = QuartersEstimates(1) engine = SimplePipelineEngine( lambda x: self.loader, self.trading_days, self.asset_finder, ) - results = engine.run_pipeline( Pipeline({c.name: c.latest for c in dataset.columns}), start_date=self.trading_days[0], @@ -328,63 +328,179 @@ class NextEstimateTestCase(EstimateTestCase): ts_eligible_estimates = ts_sorted_estimates[ ts_sorted_estimates[TS_FIELD_NAME] <= comparable_date ] - expected_estimate = pd.DataFrame() + # If there are estimates we know about: if not ts_eligible_estimates.empty: + # Determine the last piece of information we know about + # for q1 and q2. This takes advantage of the fact that we + # only have 2 quarters in the test data. q1_knowledge = ts_eligible_estimates[ ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 1 ] q2_knowledge = ts_eligible_estimates[ ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 2 ] - - # If our latest knowledge of q1 is that the release is - # happening on this simulation date or later, then that's - # the estimate we want to use. - if (not q1_knowledge.empty and - q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= - comparable_date): - expected_estimate = q1_knowledge.iloc[-1] - # If q1 has already happened or we don't know about it - # yet and our latest knowledge indicates that q2 hasn't - # happend yet, then that's the estimate we want to use. 
- elif (not q2_knowledge.empty and - q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= - comparable_date): - expected_estimate = q2_knowledge.iloc[-1] - if not expected_estimate.empty: - for colname in sid_estimates.columns: - expected_value = expected_estimate[colname] - computed_value = sid_estimates.iloc[i][colname] - assert_equal(expected_value, computed_value) + expected_estimate = self.get_expected_estimate( + q1_knowledge, + q2_knowledge, + comparable_date, + ) + if not expected_estimate.empty: + for colname in sid_estimates.columns: + expected_value = expected_estimate[colname] + computed_value = sid_estimates.iloc[i][colname] + assert_equal(expected_value, computed_value) + else: + # There are no eligible 'next' estimates on this day; + # everything should be null. + assert_true(sid_estimates.iloc[i].isnull().all()) else: - assert sid_estimates.iloc[i].isnull().all() - - def test_wrong_num_quarters_passed(self): - self._test_wrong_num_quarters_passed() + # We don't know about any estimates on this day; + # everything should be null. + assert_true(sid_estimates.iloc[i].isnull().all()) -class NextEstimateMultipleQuartersTestCase(EstimateTestCase): - events = pd.DataFrame({ - SID_FIELD_NAME: [0] * 2, - TS_FIELD_NAME: [pd.Timestamp('2015-01-01'), - pd.Timestamp('2015-01-06')], - EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-10'), - pd.Timestamp('2015-01-20')], - 'estimate': [1., 2.], - FISCAL_QUARTER_FIELD_NAME: [1, 2], - FISCAL_YEAR_FIELD_NAME: [2015, 2015] - }) - +class NextEstimate(WithEstimatesT0, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): return NextQuartersEstimatesLoader(events, columns) + def get_expected_estimate(self, + q1_knowledge, + q2_knowledge, + comparable_date): + # If our latest knowledge of q1 is that the release is + # happening on this simulation date or later, then that's + # the estimate we want to use. 
+ if (not q1_knowledge.empty and + q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= + comparable_date): + return q1_knowledge.iloc[-1] + # If q1 has already happened or we don't know about it + # yet and our latest knowledge indicates that q2 hasn't + # happened yet, then that's the estimate we want to use. + elif (not q2_knowledge.empty and + q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= + comparable_date): + return q2_knowledge.iloc[-1] + return pd.DataFrame() + + +class BlazeNextEstimateLoaderTestCase(NextEstimate): + """ + Run the same tests as EventsLoaderTestCase, but using a BlazeEventsLoader. + """ + + @classmethod + def make_loader(cls, events, columns): + return BlazeNextEstimatesLoader( + bz.data(events), + columns, + ) + + +class PreviousEstimate(WithEstimatesT0, ZiplineTestCase): + @classmethod + def make_loader(cls, events, columns): + return PreviousQuartersEstimatesLoader(events, columns) + + def get_expected_estimate(self, + q1_knowledge, + q2_knowledge, + comparable_date): + + # The expected estimate will be for q2 if the last thing + # we've seen is that the release date already happened. + # Otherwise, it'll be for q1, as long as the release date + # for q1 has already happened. + if (not q2_knowledge.empty and + q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= + comparable_date): + return q2_knowledge.iloc[-1] + elif (not q1_knowledge.empty and + q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= + comparable_date): + return q1_knowledge.iloc[-1] + return pd.DataFrame() + + +class BlazePreviousEstimateLoaderTestCase(PreviousEstimate): + """ + Run the same tests as EventsLoaderTestCase, but using a BlazeEventsLoader. 
+ """ + + @classmethod + def make_loader(cls, events, columns): + return BlazePreviousEstimatesLoader( + bz.data(events), + columns, + ) + + +class WithEstimateMultipleQuarters(WithEstimates): + """ + ZiplineTestCase mixin providing cls.events, cls.make_expected_out as + class-level fixtures and self.test_multiple_qtrs_requested as a test. + + Attributes + ---------- + events : pd.DataFrame + Simple DataFrame with estimates for 2 quarters for a single sid. + + Methods + ------- + make_expected_out() --> pd.DataFrame + Returns the DataFrame that is expected as a result of running a + Pipeline where estimates are requested for multiple quarters out. + fill_expected_out(expected) + Fills the expected DataFrame with data. + + Tests + ------ + test_multiple_qtrs_requested() + Runs a Pipeline that calculate which estimates for multiple quarters + out and checks that the returned columns contain data for the correct + number of quarters out. + """ + events = pd.DataFrame({ + SID_FIELD_NAME: [0] * 2, + TS_FIELD_NAME: [pd.Timestamp('2015-01-01'), + pd.Timestamp('2015-01-06')], + EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-20')], + 'estimate': [1., 2.], + FISCAL_QUARTER_FIELD_NAME: [1, 2], + FISCAL_YEAR_FIELD_NAME: [2015, 2015] + }) + + @classmethod + def init_class_fixtures(cls): + super(WithEstimateMultipleQuarters, cls).init_class_fixtures() + cls.expected_out = cls.make_expected_out() + + @classmethod + def make_expected_out(cls): + expected = pd.DataFrame(columns=[cls.columns[col] + '1' + for col in cls.columns] + + [cls.columns[col] + '2' + for col in cls.columns], + index=cls.trading_days) + + for (col, raw_name), suffix in itertools.product( + cls.columns.items(), ('1', '2') + ): + expected_name = raw_name + suffix + if col.dtype == datetime64ns_dtype: + expected[expected_name] = pd.to_datetime( + expected[expected_name] + ) + else: + expected[expected_name] = expected[ + expected_name + ].astype(col.dtype) + 
cls.fill_expected_out(expected) + return expected.reindex(cls.trading_days) + def test_multiple_qtrs_requested(self): - """ - This test asks for datasets that calculate which estimates to - return for multiple quarters out and checks that the returned columns - contain data for the correct number of quarters out. - """ dataset1 = QuartersEstimates(1) dataset2 = QuartersEstimates(2) engine = SimplePipelineEngine( @@ -406,202 +522,363 @@ class NextEstimateMultipleQuartersTestCase(EstimateTestCase): # We now expect a column for 1 quarter out and a column for 2 # quarters out for each of the dataset columns. - assert np.array_equal(sorted(np.array(q1_columns + q2_columns)), - sorted(results.columns.values)) - - def check_null_range(start_date, stop_date, col_name): - # Make sure that values in the given column/range are all null. - assert ( - results.loc[ - start_date:stop_date - ][col_name].isnull() - ).all() - - def check_values(start_date, end_date, col_name, qtr, event_idx): - # Make sure that values in the given column/range are all equal - # to the value at the given index from the raw data. - assert ( - results.loc[ - start_date:end_date - ][col_name + qtr] == - self.events[col_name][event_idx] - ).all() - - # Although it's painful to check the ranges one by one for different - # columns, it's important to do this so that we have a clear - # understanding of how knowledge/event dates interact and give us - # values for 1Q out and 2Q out. 
- for col in self.columns: - # 1Q out cols - check_null_range(self.START_DATE, - pd.Timestamp('2014-12-31'), - col.name + '1') - check_values(pd.Timestamp('2015-01-02'), - pd.Timestamp('2015-01-10'), - col.name, - '1', - 0) # First event is our 1Q out - check_values(pd.Timestamp('2015-01-11'), - pd.Timestamp('2015-01-20'), - col.name, - '1', - 1) # Second event becomes our 1Q out - check_null_range(pd.Timestamp('2015-01-21'), - self.END_DATE, - col.name + '1') - - # Fiscal year and quarter are different for 2Q out because even when we - # have no data for 2Q out, we still know which fiscal year/quarter we - # want data for as long as we have data for 1Q out. - for col in set(self.columns.keys()) - {Estimates.fiscal_year, - Estimates.fiscal_quarter}: - # 2Q out cols - check_null_range(self.START_DATE, - pd.Timestamp('2015-01-05'), - col.name + '2') - # We have data for 2Q out when our knowledge of - # the next quarter and the quarter after that - # overlaps and before the next quarter's event - # happens. - check_values(pd.Timestamp('2015-01-06'), - pd.Timestamp('2015-01-10'), - col.name, - '2', - 1) - check_null_range(pd.Timestamp('2015-01-11'), - self.END_DATE, - col.name + '2') - - # Check fiscal year/quarter for 2Q out. - check_null_range(self.START_DATE, - pd.Timestamp('2015-01-01'), - Estimates.fiscal_quarter.name + '2') - check_null_range(self.START_DATE, - pd.Timestamp('2015-01-01'), - Estimates.fiscal_year.name + '2') - # We have a different quarter number than the quarter numbers we have - # in our data for 2Q out, so assert manually. - assert ( - results.loc[ - pd.Timestamp('2015-01-02'):pd.Timestamp('2015-01-10') - ][Estimates.fiscal_quarter.name + '2'] == - 2 - ).all() - assert ( - results.loc[ - pd.Timestamp('2015-01-10'):pd.Timestamp('2015-01-20') - ][Estimates.fiscal_quarter.name + '2'] == - 3 - ).all() - # We have the same fiscal year, 2-15, for 2Q out over the date range of - # interest. 
- check_values(pd.Timestamp('2015-01-02'), - pd.Timestamp('2015-01-20'), - Estimates.fiscal_year.name, - '2', - 1) - check_null_range(pd.Timestamp('2015-01-21'), - self.END_DATE, - Estimates.fiscal_quarter.name + '2') - check_null_range(pd.Timestamp('2015-01-21'), - self.END_DATE, - Estimates.fiscal_year.name + '2') + assert_equal(sorted(np.array(q1_columns + q2_columns)), + sorted(results.columns.values)) + assert_frame_equal(self.expected_out.sort(axis=1), + results.xs(0, level=1).sort(axis=1)) -class BlazeNextEstimateLoaderTestCase(NextEstimateTestCase): - """ - Run the same tests as EventsLoaderTestCase, but using a BlazeEventsLoader. - """ - +class NextEstimateMultipleQuarters( + WithEstimateMultipleQuarters, ZiplineTestCase +): @classmethod def make_loader(cls, events, columns): - return BlazeNextEstimatesLoader( - bz.data(events), - columns, - ) + return NextQuartersEstimatesLoader(events, columns) + + @classmethod + def fill_expected_out(cls, expected): + # Fill columns for 1 Q out + for raw_name in cls.columns.values(): + expected[raw_name + '1'].loc[ + pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-11') + ] = cls.events[raw_name].iloc[0] + expected[raw_name + '1'].loc[ + pd.Timestamp('2015-01-11'):pd.Timestamp('2015-01-20') + ] = cls.events[raw_name].iloc[1] + + # Fill columns for 2 Q out + # We only have an estimate and event date for 2 quarters out before + # Q1's event happens; after Q1's event, we know 1 Q out but not 2 Qs + # out. 
+ for col_name in ['estimate', 'event_date']: + expected[col_name + '2'].loc[ + pd.Timestamp('2015-01-06'):pd.Timestamp('2015-01-10') + ] = cls.events[col_name].iloc[1] + # But we know what FQ and FY we'd need in both Q1 and Q2 + # because we know which FQ is next and can calculate from there + expected[FISCAL_QUARTER_FIELD_NAME + '2'].loc[ + pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-09') + ] = 2 + expected[FISCAL_QUARTER_FIELD_NAME + '2'].loc[ + pd.Timestamp('2015-01-12'):pd.Timestamp('2015-01-20') + ] = 3 + expected[FISCAL_YEAR_FIELD_NAME + '2'].loc[ + pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-20') + ] = 2015 + + return expected -class PreviousEstimateTestCase(EstimateTestCase): - events = gen_estimates() +class PreviousEstimateMultipleQuarters( + WithEstimateMultipleQuarters, + ZiplineTestCase +): @classmethod def make_loader(cls, events, columns): return PreviousQuartersEstimatesLoader(events, columns) - def test_previous_estimates(self): + @classmethod + def fill_expected_out(cls, expected): + # Fill columns for 1 Q out + for raw_name in cls.columns.values(): + expected[raw_name + '1'].loc[ + pd.Timestamp('2015-01-12'):pd.Timestamp('2015-01-19') + ] = cls.events[raw_name].iloc[0] + expected[raw_name + '1'].loc[ + pd.Timestamp('2015-01-20'): + ] = cls.events[raw_name].iloc[1] + + # Fill columns for 2 Q out + for col_name in ['estimate', 'event_date']: + expected[col_name + '2'].loc[ + pd.Timestamp('2015-01-20'): + ] = cls.events[col_name].iloc[0] + expected[ + FISCAL_QUARTER_FIELD_NAME + '2' + ].loc[pd.Timestamp('2015-01-12'):pd.Timestamp('2015-01-20')] = 4 + expected[ + FISCAL_YEAR_FIELD_NAME + '2' + ].loc[pd.Timestamp('2015-01-12'):pd.Timestamp('2015-01-20')] = 2014 + expected[ + FISCAL_QUARTER_FIELD_NAME + '2' + ].loc[pd.Timestamp('2015-01-20'):] = 1 + expected[ + FISCAL_YEAR_FIELD_NAME + '2' + ].loc[pd.Timestamp('2015-01-20'):] = 2015 + return expected + + +class WithEstimateWindows(WithEstimates): + """ + ZiplineTestCase mixin providing 
fixures and a test to test running a + Pipeline with an estimates loader over differently-sized windows. + + Attributes + ---------- + events : pd.DataFrame + DataFrame with estimates for 2 quarters for 2 sids. + window_test_start_date : pd.Timestamp + The date from which the window should start. + timelines : dict[int -> pd.DataFrame] + A dictionary mapping to the number of quarters out to + snapshots of how the data should look on each date in the date range. + + Methods + ------- + make_expected_timelines() -> dict[int -> pd.DataFrame] + Creates a dictionary of expected data. See `timelines`, above. + + Tests + ----- + test_estimate_windows_at_quarter_boundaries() + Tests that we overwrite values with the correct quarter's estimate at + the correct dates when we have a factor that asks for a window of data. + """ + sid_0_timeline = pd.DataFrame({ + TS_FIELD_NAME: [pd.Timestamp('2015-01-05'), + pd.Timestamp('2015-01-07'), + pd.Timestamp('2015-01-05'), + pd.Timestamp('2015-01-17')], + EVENT_DATE_FIELD_NAME: + [pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-01-20')], + 'estimate': [10., 11.] + [20., 21.], + FISCAL_QUARTER_FIELD_NAME: [1] * 2 + [2] * 2, + FISCAL_YEAR_FIELD_NAME: 2015, + SID_FIELD_NAME: 0, + }) + + sid_1_timeline = pd.DataFrame({ + TS_FIELD_NAME: [pd.Timestamp('2015-01-09'), + pd.Timestamp('2015-01-12'), + pd.Timestamp('2015-01-09'), + pd.Timestamp('2015-01-15')], + EVENT_DATE_FIELD_NAME: + [pd.Timestamp('2015-01-12'), pd.Timestamp('2015-01-12'), + pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-15')], + 'estimate': [10., 11.] 
+ [30., 31.], + FISCAL_QUARTER_FIELD_NAME: [1] * 2 + [3] * 2, + FISCAL_YEAR_FIELD_NAME: 2015, + SID_FIELD_NAME: 1 + }) + + window_test_start_date = pd.Timestamp('2015-01-05') + critical_dates = [pd.Timestamp('2015-01-09', tz='utc'), + pd.Timestamp('2015-01-12', tz='utc'), + pd.Timestamp('2015-01-15', tz='utc'), + pd.Timestamp('2015-01-20', tz='utc')] + # window length, starting date, num quarters out, timeline. Parameterizes + # over number of quarters out. + window_test_cases = list(itertools.product(critical_dates, (1, 2))) + events = pd.concat([sid_0_timeline, sid_1_timeline]) + + @classmethod + def make_expected_timelines(cls): + return {} + + @classmethod + def init_class_fixtures(cls): + super(WithEstimateWindows, cls).init_class_fixtures() + cls.timelines = cls.make_expected_timelines() + + @classmethod + def create_expected_df(cls, tuples, end_date): """ - The goal of this test is to make sure that we select the right - datapoint as our 'previous' w.r.t each date. + Given a list of tuples of new data we get for each sid on each critical + date (when information changes), create a DataFrame that fills that + data through a date range ending at `end_date`. """ - dataset = QuartersEstimates(1) + df = pd.DataFrame(tuples, + columns=[SID_FIELD_NAME, + 'estimate', + 'knowledge_date']) + df = df.pivot_table(columns='sid', + values='estimate', + index='knowledge_date') + df = df.reindex( + pd.date_range(cls.window_test_start_date, end_date) + ) + # Index name is lost during reindex. 
+ df.index = df.index.rename('knowledge_date') + df['at_date'] = end_date.tz_localize('utc') + df = df.set_index(['at_date', df.index.tz_localize('utc')]).ffill() + return df + + @parameterized.expand(window_test_cases) + def test_estimate_windows_at_quarter_boundaries(self, + start_idx, + num_quarters_out): + dataset = QuartersEstimates(num_quarters_out) + trading_days = self.trading_days + timelines = self.timelines + # The window length should be from the starting index back to the first + # date on which we got data. The goal is to ensure that as we + # progress through the timeline, all data we got, starting from that + # first date, is correctly overwritten. + window_len = ( + self.trading_days.get_loc(start_idx) - + self.trading_days.get_loc(self.window_test_start_date) + 1 + ) + + class SomeFactor(CustomFactor): + inputs = [dataset.estimate] + window_length = window_len + + def compute(self, today, assets, out, estimate): + today_idx = trading_days.get_loc(today) + today_timeline = timelines[ + num_quarters_out + ].loc[today].reindex( + trading_days[:today_idx + 1] + ).values + timeline_start_idx = (len(today_timeline) - window_len) + assert_equal(estimate, + today_timeline[timeline_start_idx:]) engine = SimplePipelineEngine( lambda x: self.loader, self.trading_days, self.asset_finder, ) - - results = engine.run_pipeline( - Pipeline({c.name: c.latest for c in dataset.columns}), - start_date=self.trading_days[0], - end_date=self.trading_days[-1], + engine.run_pipeline( + Pipeline({'est': SomeFactor()}), + start_date=start_idx, + end_date=pd.Timestamp('2015-01-20', tz='utc'), # last event date + # we have ) - for sid in self.sids: - sid_estimates = results.xs(sid, level=1) - ts_sorted_estimates = self.events[ - self.events[SID_FIELD_NAME] == sid - ].sort(TS_FIELD_NAME) - for i, date in enumerate(sid_estimates.index): - comparable_date = date.tz_localize(None) - # Filter out estimates we don't know about yet. 
- ts_eligible_estimates = ts_sorted_estimates[ - ts_sorted_estimates[TS_FIELD_NAME] <= comparable_date - ] - expected_estimate = pd.DataFrame() - if not ts_eligible_estimates.empty: - # Determine the last piece of information we know about - # for q1 and q2. This takes advantage of the fact that we - # only have 2 quarters in the test data. - q1_knowledge = ts_eligible_estimates[ - ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 1 - ] - q2_knowledge = ts_eligible_estimates[ - ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 2 - ] - # The expected estimate will be for q2 if the last thing - # we've seen is that the release date already happened. - # Otherwise, it'll be for q1, as long as the release date - # for q1 has already happened. - if (not q2_knowledge.empty and - q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= - comparable_date): - expected_estimate = q2_knowledge.iloc[-1] - elif (not q1_knowledge.empty and - q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= - comparable_date): - expected_estimate = q1_knowledge.iloc[-1] - if not expected_estimate.empty: - for colname in sid_estimates.columns: - expected_value = expected_estimate[colname] - computed_value = sid_estimates.iloc[i][colname] - assert_equal(expected_value, computed_value) - else: - assert sid_estimates.iloc[i].isnull().all() - - def test_wrong_num_quarters_passed(self): - self._test_wrong_num_quarters_passed() -class BlazePreviousEstimateLoaderTestCase(PreviousEstimateTestCase): - """ - Run the same tests as EventsLoaderTestCase, but using a BlazeEventsLoader. 
- """ - +class PreviousEstimateWindows(WithEstimateWindows, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): - return BlazePreviousEstimatesLoader( - bz.data(events), - columns, + return PreviousQuartersEstimatesLoader(events, columns) + + @classmethod + def make_expected_timelines(cls): + oneq_previous = pd.concat([ + cls.create_expected_df( + [(0, np.NaN, cls.window_test_start_date), + (1, np.NaN, cls.window_test_start_date)], + pd.Timestamp('2015-01-09') + ), + cls.create_expected_df( + [(0, 11, pd.Timestamp('2015-01-10')), + (1, 11, pd.Timestamp('2015-01-12'))], + pd.Timestamp('2015-01-12') + ), + cls.create_expected_df( + [(0, 11, pd.Timestamp('2015-01-10')), + (1, 11, pd.Timestamp('2015-01-12'))], + pd.Timestamp('2015-01-13') + ), + cls.create_expected_df( + [(0, 11, pd.Timestamp('2015-01-10')), + (1, 11, pd.Timestamp('2015-01-12'))], + pd.Timestamp('2015-01-14') + ), + cls.create_expected_df( + [(0, 11, pd.Timestamp('2015-01-10')), + (1, 31, pd.Timestamp('2015-01-15'))], + pd.Timestamp('2015-01-15') + ), + cls.create_expected_df( + [(0, 11, pd.Timestamp('2015-01-10')), + (1, 31, pd.Timestamp('2015-01-15'))], + pd.Timestamp('2015-01-16') + ), + cls.create_expected_df( + [(0, 21, pd.Timestamp('2015-01-17')), + (1, 31, pd.Timestamp('2015-01-15'))], + pd.Timestamp('2015-01-20') + ), + ]) + + twoq_previous = pd.concat( + [cls.create_expected_df( + [(0, np.NaN, cls.window_test_start_date), + (1, np.NaN, cls.window_test_start_date)], + end_date + ) for end_date in pd.date_range('2015-01-09', '2015-01-19')] + + [cls.create_expected_df( + [(0, 11, pd.Timestamp('2015-01-20')), + (1, np.NaN, cls.window_test_start_date)], + pd.Timestamp('2015-01-20') + )] ) + return { + 1: oneq_previous, + 2: twoq_previous + } + + +class NextEstimateWindows(WithEstimateWindows, ZiplineTestCase): + @classmethod + def make_loader(cls, events, columns): + return NextQuartersEstimatesLoader(events, columns) + + @classmethod + def make_expected_timelines(cls): + 
oneq_next = pd.concat([ + cls.create_expected_df( + [(0, 10, cls.window_test_start_date), + (0, 11, pd.Timestamp('2015-01-07')), + (1, 10, pd.Timestamp('2015-01-09'))], + pd.Timestamp('2015-01-09') + ), + cls.create_expected_df( + [(0, 20, cls.window_test_start_date), + (1, 10, pd.Timestamp('2015-01-09')), + (1, 11, pd.Timestamp('2015-01-12'))], + pd.Timestamp('2015-01-12') + ), + cls.create_expected_df( + [(0, 20, cls.window_test_start_date), + (1, 30, pd.Timestamp('2015-01-09'))], + pd.Timestamp('2015-01-13') + ), + cls.create_expected_df( + [(0, 20, cls.window_test_start_date), + (1, 30, pd.Timestamp('2015-01-09'))], + pd.Timestamp('2015-01-14') + ), + cls.create_expected_df( + [(0, 20, cls.window_test_start_date), + (1, 30, pd.Timestamp('2015-01-09')), + (1, 31, pd.Timestamp('2015-01-15'))], + pd.Timestamp('2015-01-15') + ), + cls.create_expected_df( + [(0, 20, cls.window_test_start_date), + (1, np.NaN, cls.window_test_start_date)], + pd.Timestamp('2015-01-16') + ), + cls.create_expected_df( + [(0, 20, cls.window_test_start_date), + (0, 21, pd.Timestamp('2015-01-17')), + (1, np.NaN, cls.window_test_start_date)], + pd.Timestamp('2015-01-20') + ), + ]) + + twoq_next = pd.concat( + [cls.create_expected_df( + [(0, 20, pd.Timestamp(cls.window_test_start_date)), + (1, np.NaN, pd.Timestamp(cls.window_test_start_date))], + pd.Timestamp('2015-01-09') + )] + + [cls.create_expected_df( + [(0, np.NaN, pd.Timestamp(cls.window_test_start_date)), + (1, np.NaN, pd.Timestamp(cls.window_test_start_date))], + end_date + ) for end_date in pd.date_range('2015-01-12', '2015-01-20')] + ) + + return { + 1: oneq_next, + 2: twoq_next + } class QuarterShiftTestCase(ZiplineTestCase): @@ -610,12 +887,12 @@ class QuarterShiftTestCase(ZiplineTestCase): backwards/forwards from a starting point. 
""" def test_quarter_normalization(self): - input_yrs = pd.Series([0] * 4) - input_qtrs = pd.Series(range(1, 5)) + input_yrs = pd.Series([0] * 4, dtype=np.int64) + input_qtrs = pd.Series(range(1, 5), dtype=np.int64) result_years, result_quarters = split_normalized_quarters( normalize_quarters(input_yrs, input_qtrs) ) # Can't use assert_series_equal here with check_names=False # because that still fails due to name differences. - assert input_yrs.equals(result_years) - assert input_qtrs.equals(result_quarters) + assert_equal(input_yrs, result_years) + assert_equal(input_qtrs, result_quarters) diff --git a/zipline/lib/adjustment.pyx b/zipline/lib/adjustment.pyx index 8839ece9..aea8df5c 100644 --- a/zipline/lib/adjustment.pyx +++ b/zipline/lib/adjustment.pyx @@ -3,7 +3,7 @@ from cpython cimport Py_EQ from pandas import isnull, Timestamp from numpy cimport float64_t, uint8_t, int64_t -from numpy import asarray, datetime64, float64 +from numpy import asarray, datetime64, float64, int64 # Purely for readability. There aren't C-level declarations for these types. ctypedef object Int64Index_t ctypedef object DatetimeIndex_t @@ -451,28 +451,32 @@ cdef class Datetime641DArrayOverwrite(ArrayAdjustment): Example ------- - >>> import numpy as np - >>> arr = np.arange(25, dtype=float).reshape(5, 5) - >>> arr - array([[ 0., 1., 2., 3., 4.], - [ 5., 6., 7., 8., 9.], - [ 10., 11., 12., 13., 14.], - [ 15., 16., 17., 18., 19.], - [ 20., 21., 22., 23., 24.]]) + >>> import numpy as np; import pandas as pd + >>> dts = pd.date_range('2014', freq='D', periods=9, tz='UTC') + >>> arr = dts.values.reshape(3, 3) + >>> arr == np.datetime64(0, 'ns') + array([[False, False, False], + [False, False, False], + [False, False, False]], dtype=bool) >>> adj = Datetime641DArrayOverwrite( - ... row_start=0, - ... row_end=3, - ... column_start=0, - ... column_end=0, - ... 
values=np.array([1, 2, 3, 4]), - ) - >>> adj.mutate(arr) - >>> arr - array([[ 1., 1., 2., 3., 4.], - [ 2., 6., 7., 8., 9.], - [ 3., 11., 12., 13., 14.], - [ 4., 16., 17., 18., 19.], - [ 20., 21., 22., 23., 24.]]) + ... first_row=1, + ... last_row=2, + ... first_col=1, + ... last_col=2, + ... values=np.array([ + ... np.datetime64(0, 'ns'), + ... np.datetime64(1, 'ns') + ... ]) + ... ) + >>> adj.mutate(arr.view(np.int64)) + >>> arr == np.datetime64(0, 'ns') + array([[False, False, False], + [False, True, True], + [False, False, False]], dtype=bool) + >>> arr == np.datetime64(1, 'ns') + array([[False, False, False], + [False, False, False], + [False, True, True]], dtype=bool) """ cdef: readonly int64_t[:] values @@ -598,7 +602,7 @@ cdef datetime_to_int(object datetimelike): datetimelike.dtype.name, ) - return datetimelike.astype(int) + return datetimelike.astype(int64) cdef class Datetime64Adjustment(_Int64Adjustment): diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 24ce3e26..5264a954 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -1096,12 +1096,15 @@ class BlazeLoader(dict): sparse_deltas = last_in_date_group(non_novel_deltas, dates, assets, - reindex=False) + reindex=False, + have_sids=have_sids) dense_output = last_in_date_group(sparse_output, dates, assets, - reindex=True) - ffill_across_cols(dense_output, columns) + reindex=True, + have_sids=have_sids) + ffill_across_cols(dense_output, columns, {c.name: c.name + for c in columns}) if have_sids: adjustments_from_deltas = adjustments_from_deltas_with_sids column_view = identity diff --git a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py index 81554b0d..864b05a7 100644 --- a/zipline/pipeline/loaders/blaze/estimates.py +++ b/zipline/pipeline/loaders/blaze/estimates.py @@ -25,6 +25,8 @@ class BlazeEstimatesLoader(PipelineLoader): ---------- expr : Expr The expression 
representing the data to load. + columns : dict[str -> str] + A dict mapping BoundColumn names to the associated names in `expr`. resources : dict, optional Mapping from the loadable terms of ``expr`` to actual data resources. odo_kwargs : dict, optional @@ -33,8 +35,6 @@ class BlazeEstimatesLoader(PipelineLoader): The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. - dataset : DataSet - The DataSet object for which this loader loads data. Notes ----- @@ -43,12 +43,14 @@ class BlazeEstimatesLoader(PipelineLoader): Dim * {{ {SID_FIELD_NAME}: int64, {TS_FIELD_NAME}: datetime, + {FISCAL_YEAR_FIELD_NAME}: float64, + {FISCAL_QUARTER_FIELD_NAME}: float64, + {EVENT_DATE_FIELD_NAME}: datetime, }} And other dataset-specific fields, where each row of the table is a record including the sid to identify the company, the timestamp where we - learned about the announcement, and the date when the earnings will be - announced. + learned about the announcement, and the date of the event. If the '{TS_FIELD_NAME}' field is not included it is assumed that we start the backtest with knowledge of all announcements. diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 39e5dac6..56c050f2 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -24,6 +24,10 @@ class BlazeEventsLoader(PipelineLoader): ---------- expr : Expr The expression representing the data to load. + next_value_columns : dict[BoundColumn -> raw column name] + A dict mapping 'next' BoundColumns to their column names in `expr`. + previous_value_columns : dict[BoundColumn -> raw column name] + A dict mapping 'previous' BoundColumns to their column names in `expr`. resources : dict, optional Mapping from the loadable terms of ``expr`` to actual data resources. 
odo_kwargs : dict, optional @@ -32,8 +36,6 @@ class BlazeEventsLoader(PipelineLoader): The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezone to use for the data query cutoff. - dataset : DataSet - The DataSet object for which this loader loads data. Notes ----- @@ -42,12 +44,12 @@ class BlazeEventsLoader(PipelineLoader): Dim * {{ {SID_FIELD_NAME}: int64, {TS_FIELD_NAME}: datetime, + {EVENT_DATE_FIELD_NAME}: datetime, }} And other dataset-specific fields, where each row of the table is a record including the sid to identify the company, the timestamp where we - learned about the announcement, and the date when the earnings will be z - announced. + learned about the announcement, and the event date. If the '{TS_FIELD_NAME}' field is not included it is assumed that we start the backtest with knowledge of all announcements. @@ -84,8 +86,12 @@ class BlazeEventsLoader(PipelineLoader): self._data_query_tz = data_query_tz def load_adjusted_array(self, columns, dates, assets, mask): - raw = load_raw_data(assets, dates, self._data_query_time, - self._data_query_tz, self._expr, self._odo_kwargs) + raw = load_raw_data(assets, + dates, + self._data_query_time, + self._data_query_tz, + self._expr, + self._odo_kwargs) return EventsLoader( events=raw, diff --git a/zipline/pipeline/loaders/blaze/utils.py b/zipline/pipeline/loaders/blaze/utils.py index 6455f76c..963d9f9a 100644 --- a/zipline/pipeline/loaders/blaze/utils.py +++ b/zipline/pipeline/loaders/blaze/utils.py @@ -6,7 +6,11 @@ from zipline.pipeline.loaders.utils import ( ) -def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, +def load_raw_data(assets, + dates, + data_query_time, + data_query_tz, + expr, odo_kwargs): """ given an expression representing data to load, perform normalization and @@ -25,13 +29,14 @@ def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, `time`. expr : expr the expression representing the data to load. 
- odo_kwargs : dict, optional + odo_kwargs : dict extra keyword arguments to pass to odo when executing the expression. returns ------- raw : pd.dataframe - the data symbolized by `expr` materialized in a dataframe. + The result of computing expr and materializing the result as a + dataframe. """ lower_dt, upper_dt = normalize_data_query_bounds( dates[0], @@ -45,7 +50,7 @@ def load_raw_data(assets, dates, data_query_time, data_query_tz, expr, upper_dt, odo_kwargs, ) - sids = raw.loc[:, SID_FIELD_NAME] + sids = raw[SID_FIELD_NAME] raw.drop( sids[~sids.isin(assets)].index, inplace=True diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index 6ef8ae7c..54686a6e 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,9 +1,8 @@ from abc import abstractmethod -from collections import defaultdict import numpy as np -import pandas as pd from six import viewvalues from toolz import groupby + from zipline.lib.adjusted_array import AdjustedArray from zipline.lib.adjustment import (Datetime641DArrayOverwrite, Float641DArrayOverwrite) @@ -22,14 +21,15 @@ from zipline.pipeline.loaders.utils import ( last_in_date_group ) -NORMALIZED_QUARTERS = 'normalized_quarters' - -SHIFTED_NORMALIZED_QTRS = 'shifted_normalized_quarters' +INVALID_NUM_QTRS_MESSAGE = "Passed invalid number of quarters %s; " \ + "must pass a number of quarters >= 0" NEXT_FISCAL_QUARTER = 'next_fiscal_quarter' NEXT_FISCAL_YEAR = 'next_fiscal_year' +NORMALIZED_QUARTERS = 'normalized_quarters' PREVIOUS_FISCAL_QUARTER = 'previous_fiscal_quarter' PREVIOUS_FISCAL_YEAR = 'previous_fiscal_year' +SHIFTED_NORMALIZED_QTRS = 'shifted_normalized_quarters' SIMULTATION_DATES = 'dates' @@ -86,10 +86,10 @@ def validate_column_specs(events, columns): class QuarterEstimatesLoader(PipelineLoader): def __init__(self, estimates, - base_column_name_map): + name_map): validate_column_specs( estimates, - 
base_column_name_map + name_map ) self.estimates = estimates[ @@ -97,12 +97,16 @@ class QuarterEstimatesLoader(PipelineLoader): estimates[FISCAL_QUARTER_FIELD_NAME].notnull() & estimates[FISCAL_YEAR_FIELD_NAME].notnull() ] + self.estimates[NORMALIZED_QUARTERS] = normalize_quarters( + self.estimates[FISCAL_YEAR_FIELD_NAME], + self.estimates[FISCAL_QUARTER_FIELD_NAME], + ) - self.base_column_name_map = base_column_name_map + self.name_map = name_map @abstractmethod def load_quarters(self, num_quarters, last, dates): - pass + raise NotImplementedError('load_quarters') def get_requested_data_for_col(self, stacked_last_per_qtr, idx, dates): """ @@ -111,8 +115,8 @@ class QuarterEstimatesLoader(PipelineLoader): Parameters ---------- stacked_last_per_qtr : pd.DataFrame - The latest estimate known per sid per date per quarter with the - dates, normalized quarter, and sid as the index. + The latest estimate known with the dates, normalized quarter, and + sid as the index. idx : pd.MultiIndex The index of the row of the requested quarter from each date for each sid. @@ -122,16 +126,18 @@ class QuarterEstimatesLoader(PipelineLoader): Returns -------- requested_qtr_data : pd.DataFrame - The DataFrame with final values for the requested quarter for all - columns; `dates` are the index and columns are a MultiIndex with - sids at the top level and the dataset columns on the bottom. + The DataFrame with the latest values for the requested quarter + for all columns; `dates` are the index and columns are a MultiIndex + with sids at the top level and the dataset columns on the bottom. """ requested_qtr_data = stacked_last_per_qtr.loc[idx] - # We no longer need this in the index, but we do need it as a column - # to calculate adjustments. + # We no longer need the shifted normalized quarters in the index, but + # we do need it as a column to calculate adjustments. 
requested_qtr_data = requested_qtr_data.reset_index( SHIFTED_NORMALIZED_QTRS ) + # Calculate the actual year/quarter being requested and add those in + # as columns. (requested_qtr_data[FISCAL_YEAR_FIELD_NAME], requested_qtr_data[FISCAL_QUARTER_FIELD_NAME]) = \ split_normalized_quarters( @@ -154,8 +160,7 @@ class QuarterEstimatesLoader(PipelineLoader): column_name, column, mask, - assets, - qtr_crossover_point): + assets): """ Creates an AdjustedArray from the given estimates data for the given dates. @@ -183,18 +188,17 @@ class QuarterEstimatesLoader(PipelineLoader): computed. column : BoundColumn The column for which the AdjustedArray is being computed. - mask : - assets : - qtr_crossover_point : - Whether we should use the 'right' or 'left' side when doing - searchsorted on the dates for quarter boundaries. + mask : np.array + Mask array of dimensions len(dates) X len(assets). + assets : pd.Int64Index + An index of all the assets from the raw data. Returns ------- adjusted_array : AdjustedArray The array of data and overwrites for the given column. """ - adjustments = defaultdict(list) + adjustments = {} requested_qtr_data = self.get_requested_data_for_col( stacked_last_per_qtr, requested_qtr_idx, dates ) @@ -204,10 +208,8 @@ class QuarterEstimatesLoader(PipelineLoader): zero_qtr_data = zero_qtr_data.reset_index(NORMALIZED_QUARTERS) if column.dtype == datetime64ns_dtype: overwrite = Datetime641DArrayOverwrite - missing_value = np.datetime64('NaT', 'ns') else: overwrite = Float641DArrayOverwrite - missing_value = np.NaN for sid_idx, sid in enumerate(assets): zero_qtr_sid_data = zero_qtr_data[ zero_qtr_data.index.get_level_values(SID_FIELD_NAME) == sid @@ -225,7 +227,7 @@ class QuarterEstimatesLoader(PipelineLoader): ] # For the given sid, determine which quarters we have estimates # for. 
- quarters_with_estimates_for_sid = last_per_qtr.xs( + qtrs_with_estimates_for_sid = last_per_qtr.xs( sid, axis=1, level=SID_FIELD_NAME ).groupby(axis=1, level=1).first().columns.values for row_indexer in list(qtr_shifts.index): @@ -233,108 +235,162 @@ class QuarterEstimatesLoader(PipelineLoader): # after this row. This isn't the starting index of the # requested quarter, but simply the date we cross over into a # new quarter. - qtr_start_idx = dates.searchsorted( + next_qtr_start_idx = dates.searchsorted( zero_qtr_data.loc[ row_indexer ][EVENT_DATE_FIELD_NAME], - side=qtr_crossover_point + side='left' + if isinstance(self, PreviousQuartersEstimatesLoader) + else 'right' ) - - # Only add adjustments if the next quarter starts somewhere in - # our date index for this sid. Our 'next' quarter can never - # start at index 0; a starting index of 0 means that the next - # quarter's event date was NaT. - if 0 < qtr_start_idx < len(dates): - # Find the quarter being requested in the quarter we're - # crossing into. - requested_quarter = requested_qtr_data[ - SHIFTED_NORMALIZED_QTRS - ][sid].iloc[qtr_start_idx] - - # If there are estimates for the requested quarter, - # overwrite all values going up to the starting index of - # that quarter with estimates for that quarter. - if requested_quarter in quarters_with_estimates_for_sid: - adjustments[qtr_start_idx] = \ - [overwrite( - 0, - qtr_start_idx - 1, # overwrite thru last qtr - sid_idx, - sid_idx, - last_per_qtr[column_name, - requested_quarter, - sid][:qtr_start_idx].values)] - # There are no estimates for the quarter. Overwrite all - # values going up to the starting index of that quarter - # with the missing value for this column. 
- else: - adjustments[qtr_start_idx] = [ - overwrite( - 0, - qtr_start_idx - 1, - sid_idx, - sid_idx, - np.array( - [missing_value] * - len(last_per_qtr.index[:qtr_start_idx])) - ) - ] + adjustments[next_qtr_start_idx] = \ + self.create_overwrite_for_quarter( + next_qtr_start_idx, + column, + column_name, + dates, + last_per_qtr, + overwrite, + qtrs_with_estimates_for_sid, + requested_qtr_data, + sid, + sid_idx, + ) return AdjustedArray( - requested_qtr_data[column_name].values.astype(column.dtype), - mask, - dict(adjustments), - column.missing_value, - ) + requested_qtr_data[column_name].values.astype(column.dtype), + mask, + dict(adjustments), + column.missing_value, + ) + + def create_overwrite_for_quarter(self, + next_qtr_start_idx, + column, + column_name, + dates, + last_per_qtr, + overwrite, + quarters_with_estimates_for_sid, + requested_qtr_data, + sid, + sid_idx): + # Only add adjustments if the next quarter starts somewhere in + # our date index for this sid. Our 'next' quarter can never + # start at index 0; a starting index of 0 means that the next + # quarter's event date was NaT. + if 0 < next_qtr_start_idx < len(dates): + # Find the quarter being requested in the quarter we're + # crossing into. + requested_quarter = requested_qtr_data[ + SHIFTED_NORMALIZED_QTRS + ][sid].iloc[next_qtr_start_idx] + + # If there are estimates for the requested quarter, + # overwrite all values going up to the starting index of + # that quarter with estimates for that quarter. + if requested_quarter in quarters_with_estimates_for_sid: + return self.create_overwrite_for_estimate( + column, + column_name, + last_per_qtr, + next_qtr_start_idx, + overwrite, + requested_quarter, + sid, + sid_idx + ) + # There are no estimates for the quarter. Overwrite all + # values going up to the starting index of that quarter + # with the missing value for this column. 
+ else: + return self.overwrite_with_null( + column, + last_per_qtr, + next_qtr_start_idx, + overwrite, + sid_idx + ) + + def overwrite_with_null(self, + column, + last_per_qtr, + next_qtr_start_idx, + overwrite, + sid_idx): + return [overwrite( + 0, + next_qtr_start_idx - 1, + sid_idx, + sid_idx, + np.full( + len( + last_per_qtr.index[:next_qtr_start_idx] + ), + column.missing_value, + dtype=column.dtype + ))] def load_adjusted_array(self, columns, dates, assets, mask): - # TODO: how can we enforce that datasets have the num_quarters - # attribute, given that they're created dynamically? - groups = groupby(lambda x: x.dataset.num_quarters, columns) - groups_columns = dict(groups) - if (pd.Series(groups_columns.keys()) < 0).any(): - raise ValueError("Must pass a number of quarters >= 0") + # Separate out getting the columns' datasets and the datasets' + # num_quarters attributes to ensure that we're catching the right + # AttributeError. + col_to_datasets = {col: col.dataset for col in columns} + try: + groups = groupby(lambda col: col_to_datasets[col].num_quarters, + col_to_datasets) + except AttributeError: + raise AttributeError("Datasets loaded via the " + "QuarterEstimatesLoader must define a " + "`num_quarters` attribute that defines how " + "many quarters out the loader should load " + "the data relative to `dates`.") + if any(num_qtr < 0 for num_qtr in groups): + raise ValueError( + INVALID_NUM_QTRS_MESSAGE % ','.join( + str(qtr) for qtr in groups if qtr < 0 + ) + + ) out = {} - self.estimates[NORMALIZED_QUARTERS] = normalize_quarters( - self.estimates[FISCAL_YEAR_FIELD_NAME], - self.estimates[FISCAL_QUARTER_FIELD_NAME], - ) - for num_quarters, columns in groups_columns.items(): - # The column's dataset is itself dynamic and the mapping we - # actually want is to its dataset's parent's column name. 
- name_map = {c: self.base_column_name_map[ - getattr(c.dataset.__base__, c.name) - ] for c in columns} + + for num_quarters, columns in groups.items(): # Determine the last piece of information we know for each column # on each date in the index for each sid and quarter. last_per_qtr = last_in_date_group( - self.estimates, True, dates, assets, + self.estimates, dates, assets, reindex=True, extra_groupers=[NORMALIZED_QUARTERS] ) # Forward fill values for each quarter/sid/dataset column. - ffill_across_cols(last_per_qtr, columns) + ffill_across_cols(last_per_qtr, columns, self.name_map) # Stack quarter and sid into the index. - stacked_last_per_qtr = last_per_qtr.stack([NORMALIZED_QUARTERS, - SID_FIELD_NAME]) + stacked_last_per_qtr = last_per_qtr.stack([SID_FIELD_NAME, + NORMALIZED_QUARTERS]) # Set date index name for ease of reference - stacked_last_per_qtr.index.set_names(SIMULTATION_DATES, 0, True) + stacked_last_per_qtr.index.set_names(SIMULTATION_DATES, + level=0, + inplace=True) + # We want to know the most recent/next event relative to each date. + stacked_last_per_qtr = stacked_last_per_qtr.sort( + EVENT_DATE_FIELD_NAME + ) # Determine which quarter is next/previous for each date. 
shifted_qtr_data = self.load_quarters(num_quarters, stacked_last_per_qtr) zero_qtr_idx = shifted_qtr_data.index requested_qtr_idx = shifted_qtr_data.set_index([ - shifted_qtr_data.index.get_level_values( - SIMULTATION_DATES - ), - shifted_qtr_data[SHIFTED_NORMALIZED_QTRS], - shifted_qtr_data.index.get_level_values( - SID_FIELD_NAME - )] - ).index + shifted_qtr_data.index.get_level_values( + SIMULTATION_DATES + ), + shifted_qtr_data.index.get_level_values( + SID_FIELD_NAME + ), + shifted_qtr_data[SHIFTED_NORMALIZED_QTRS] + ]).index for c in columns: - column_name = name_map[c] + column_name = self.name_map[c.name] adjusted_array = self.get_adjustments(zero_qtr_idx, requested_qtr_idx, stacked_last_per_qtr, @@ -343,26 +399,68 @@ class QuarterEstimatesLoader(PipelineLoader): column_name, c, mask, - assets, - self.qtr_crossover_point) + assets) out[c] = adjusted_array return out class NextQuartersEstimatesLoader(QuarterEstimatesLoader): - qtr_crossover_point = 'right' + def create_overwrite_for_estimate(self, + column, + column_name, + last_per_qtr, + next_qtr_start_idx, + overwrite, + requested_quarter, + sid, + sid_idx): + return [overwrite( + 0, + # overwrite thru last qtr + next_qtr_start_idx - 1, + sid_idx, + sid_idx, + last_per_qtr[ + column_name, + requested_quarter, + sid + ][0:next_qtr_start_idx].values)] def load_quarters(self, num_quarters, stacked_last_per_qtr): - # Filter for releases that are on or after each simulation date and - # determine the next quarter by picking out the upcoming release for - # each date in the index. - stacked_last_per_qtr = stacked_last_per_qtr.sort( - EVENT_DATE_FIELD_NAME - ) + """ + Filters for releases that are on or after each simulation date and + determines the next quarter by picking out the upcoming release for + each date in the index. Adds a SHIFTED_NORMALIZED_QTRS column which + contains the requested next quarter for each calendar date and sid.
+ + Parameters + ---------- + num_quarters : int + Number of quarters to go out in the future. + stacked_last_per_qtr : pd.DataFrame + A DataFrame with index of calendar dates, sid, and normalized + quarters with each row being the latest estimate for the row's + index values, sorted by event date. + + Returns + ------- + next_releases_per_date : pd.DataFrame + A DataFrame with index of calendar dates, sid, and normalized + quarters, keeping only rows with next event information relative to + the index values and with an added column for + SHIFTED_NORMALIZED_QTRS, which contains the requested quarter for + each row. + """ + + # We reset the index here because in pandas3, a groupby on the index + # will set the index to just the items in the groupby, so we will lose + # the normalized quarters. next_releases_per_date = stacked_last_per_qtr.loc[ stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >= stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) - ].groupby(level=[SIMULTATION_DATES, SID_FIELD_NAME]).nth(0) + ].reset_index(NORMALIZED_QUARTERS).groupby( + level=[SIMULTATION_DATES, SID_FIELD_NAME] + ).nth(0).set_index(NORMALIZED_QUARTERS, append=True) next_releases_per_date[ SHIFTED_NORMALIZED_QTRS ] = next_releases_per_date.index.get_level_values( @@ -372,18 +470,57 @@ class NextQuartersEstimatesLoader(QuarterEstimatesLoader): class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): - qtr_crossover_point = 'left' + def create_overwrite_for_estimate(self, + column, + column_name, + last_per_qtr, + next_qtr_start_idx, + overwrite, + requested_quarter, + sid, + sid_idx): + return self.overwrite_with_null(column, + last_per_qtr, + next_qtr_start_idx, + overwrite, + sid_idx) def load_quarters(self, num_quarters, stacked_last_per_qtr): - # Filter for releases that are on or before each simulation date and - # determine the previous quarter by picking out the upcoming release - # for each date in the index. 
- stacked_last_per_qtr = stacked_last_per_qtr.sort(EVENT_DATE_FIELD_NAME) + """ + Filters for releases that are on or before each simulation date and + determines the previous quarter by picking out the most recent + release relative to each date in the index. Adds a + SHIFTED_NORMALIZED_QTRS column which contains the requested previous + quarter for each calendar date and sid. + + Parameters + ---------- + num_quarters : int + Number of quarters to go out in the past. + stacked_last_per_qtr : pd.DataFrame + A DataFrame with index of calendar dates, sid, and normalized + quarters with each row being the latest estimate for the row's + index values, sorted by event date. + + Returns + ------- + previous_releases_per_date : pd.DataFrame + A DataFrame with index of calendar dates, sid, and normalized + quarters, keeping only rows that have a previous event relative + to the index values and with an added column for + SHIFTED_NORMALIZED_QTRS, which contains the requested quarter for + each row. + """ + + # We reset the index here because in pandas3, a groupby on the index + # will set the index to just the items in the groupby, so we will lose + # the normalized quarters.
previous_releases_per_date = stacked_last_per_qtr.loc[ stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <= - stacked_last_per_qtr.index.get_level_values( - SIMULTATION_DATES - )].groupby(level=[SIMULTATION_DATES, SID_FIELD_NAME]).nth(-1) + stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) + ].reset_index(NORMALIZED_QUARTERS).groupby( + level=[SIMULTATION_DATES, SID_FIELD_NAME] + ).nth(-1).set_index(NORMALIZED_QUARTERS, append=True) previous_releases_per_date[ SHIFTED_NORMALIZED_QTRS ] = previous_releases_per_date.index.get_level_values( diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 5bd91132..125dacfb 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -276,7 +276,7 @@ def check_data_query_args(data_query_time, data_query_tz): ) -def last_in_date_group(df, reindex, dates, assets, have_sids=True, +def last_in_date_group(df, dates, assets, reindex=True, have_sids=True, extra_groupers=[]): """ Determine the last piece of information known on each date in the date @@ -286,14 +286,14 @@ def last_in_date_group(df, reindex, dates, assets, have_sids=True, ---------- df : pd.DataFrame The DataFrame containing the data to be grouped. - reindex : bool - Whether or not the DataFrame should be reindexed against the date - index. This will add back any dates to the index that were grouped - away. dates : pd.DatetimeIndex The dates to use for grouping and reindexing. assets : pd.Int64Index The assets that should be included in the column multiindex. + reindex : bool + Whether or not the DataFrame should be reindexed against the date + index. This will add back any dates to the index that were grouped + away. have_sids : bool Whether or not the DataFrame has sids. If it does, they will be used in the groupby. @@ -307,11 +307,11 @@ def last_in_date_group(df, reindex, dates, assets, have_sids=True, levels of a multiindex of columns. 
""" - idx = dates[dates.searchsorted( + idx = [dates[dates.searchsorted( df[TS_FIELD_NAME].values.astype('datetime64[D]') - )] + )]] if have_sids: - idx = [idx, SID_FIELD_NAME] + idx += [SID_FIELD_NAME] idx += extra_groupers last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby( @@ -321,7 +321,7 @@ def last_in_date_group(df, reindex, dates, assets, have_sids=True, # For the number of things that we're grouping by (except TS), unstack # the df - last_in_group = last_in_group.unstack([-1, -2]) + last_in_group = last_in_group.unstack(list(range(-1, -len(idx), -1))) if reindex: if have_sids: @@ -339,7 +339,7 @@ def last_in_date_group(df, reindex, dates, assets, have_sids=True, return last_in_group -def ffill_across_cols(df, columns): +def ffill_across_cols(df, columns, name_map): """ Forward fill values in a DataFrame with special logic to handle cases that pd.DataFrame.ffill cannot and cast columns to appropriate types. @@ -351,6 +351,9 @@ def ffill_across_cols(df, columns): columns : list of BoundColumn The BoundColumns that correspond to columns in the DataFrame to which special filling and/or casting logic should be applied. + name_map: map of string -> string + Mapping from the name of each BoundColumn to the associated column + name in `df`. """ df.ffill(inplace=True) @@ -369,18 +372,19 @@ def ffill_across_cols(df, columns): # pandas to replace NaNs in an object column with None using fillna, # so we have to roll our own instead using df.where. for column in columns: + column_name = name_map[column.name] # Special logic for strings since `fillna` doesn't work if the # missing value is `None`. if column.dtype == categorical_dtype: - df[column.name] = df[ + df[column_name] = df[ column.name - ].where(pd.notnull(df[column.name]), + ].where(pd.notnull(df[column_name]), column.missing_value) else: # We need to execute `fillna` before `astype` in case the # column contains NaNs and needs to be cast to bool or int. 
# This is so that the NaNs are replaced first, since pandas # can't convert NaNs for those types. - df[column.name] = df[ - column.name + df[column_name] = df[ + column_name ].fillna(column.missing_value).astype(column.dtype) diff --git a/zipline/testing/core.py b/zipline/testing/core.py index f9832767..19240a5b 100644 --- a/zipline/testing/core.py +++ b/zipline/testing/core.py @@ -49,8 +49,14 @@ from zipline.pipeline.loaders.testing import make_seeded_random_loader from zipline.utils import security_list from zipline.utils.calendars import get_calendar from zipline.utils.input_validation import expect_dimensions +<<<<<<< HEAD from zipline.utils.numpy_utils import as_column, isnat from zipline.utils.pandas_utils import timedelta_to_integral_seconds +======= +from zipline.utils.numpy_utils import ( + as_column, +) +>>>>>>> WIP from zipline.utils.sentinel import sentinel import numpy as np diff --git a/zipline/testing/fixtures.py b/zipline/testing/fixtures.py index f0e2aaa5..1ce01627 100644 --- a/zipline/testing/fixtures.py +++ b/zipline/testing/fixtures.py @@ -34,13 +34,14 @@ from ..finance.trading import TradingEnvironment from ..utils import factory from ..utils.classproperty import classproperty from ..utils.final import FinalMeta, final -from .core import tmp_asset_finder, make_simple_equity_info +from .core import (tmp_asset_finder, make_simple_equity_info) from zipline.assets import Equity, Future from zipline.pipeline import SimplePipelineEngine from zipline.pipeline.loaders.testing import make_seeded_random_loader from zipline.utils.calendars import ( get_calendar, - register_calendar) + register_calendar +) class ZiplineTestCase(with_metaclass(FinalMeta, TestCase)): From 90984be20cff37415addb95362d23e1daf87793a Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 13 Sep 2016 10:53:00 -0400 Subject: [PATCH 11/16] TST: add test for missing num_quarters and clean up tests --- tests/pipeline/test_quarters_estimates.py | 209 ++++++++++++++-------- 1 file 
changed, 130 insertions(+), 79 deletions(-) diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 9217df77..56e7827b 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -4,7 +4,7 @@ from nose.tools import assert_true from nose_parameterized import parameterized import numpy as np import pandas as pd -from pandas.util.testing import assert_frame_equal +from pandas.util.testing import assert_frame_equal, assert_series_equal from toolz import merge from zipline.pipeline import SimplePipelineEngine, Pipeline, CustomFactor @@ -22,6 +22,7 @@ from zipline.pipeline.loaders.blaze.estimates import ( BlazePreviousEstimatesLoader ) from zipline.pipeline.loaders.quarter_estimates import ( + INVALID_NUM_QTRS_MESSAGE, NextQuartersEstimatesLoader, normalize_quarters, PreviousQuartersEstimatesLoader, @@ -32,7 +33,7 @@ from zipline.testing.fixtures import ( WithTradingSessions, ZiplineTestCase, ) -from zipline.testing.predicates import assert_equal +from zipline.testing.predicates import assert_equal, assert_raises_regex from zipline.utils.numpy_utils import datetime64ns_dtype from zipline.utils.numpy_utils import float64_dtype @@ -51,6 +52,12 @@ def QuartersEstimates(num_qtr): return QtrEstimates +def QuartersEstimatesNoNumQuartersAttr(num_qtr): + class QtrEstimates(Estimates): + name = Estimates + return QtrEstimates + + class WithEstimates(WithTradingSessions, WithAssetFinder): """ ZiplineTestCase mixin providing cls.loader and cls.events as class @@ -77,8 +84,13 @@ class WithEstimates(WithTradingSessions, WithAssetFinder): def make_loader(cls, events, columns): raise NotImplementedError('make_loader') + @classmethod + def make_events(cls): + raise NotImplementedError('make_events') + @classmethod def init_class_fixtures(cls): + cls.events = cls.make_events() cls.sids = cls.events[SID_FIELD_NAME].unique() cls.columns = { Estimates.event_date: 'event_date', @@ -98,7 +110,7 @@ 
class WithEstimates(WithTradingSessions, WithAssetFinder): super(WithEstimates, cls).init_class_fixtures() -class WithWrongNumQuarters(WithEstimates): +class WithWrongLoaderDefinition(WithEstimates): """ ZiplineTestCase mixin providing cls.events as a class level fixture and defining a test for all inheritors to use. @@ -113,18 +125,48 @@ class WithWrongNumQuarters(WithEstimates): ------ test_wrong_num_quarters_passed() Tests that loading with an incorrect quarter number raises an error. + test_no_num_quarters_attr() + Tests that the loader throws an AssertionError if the dataset being + loaded has no `num_quarters` attribute. """ - events = pd.DataFrame({SID_FIELD_NAME: 0}, - columns=[SID_FIELD_NAME, - TS_FIELD_NAME, - EVENT_DATE_FIELD_NAME, - FISCAL_QUARTER_FIELD_NAME, - FISCAL_YEAR_FIELD_NAME, - 'estimate'], - index=[0]) + + @classmethod + def make_events(cls): + return pd.DataFrame({SID_FIELD_NAME: 0}, + columns=[SID_FIELD_NAME, + TS_FIELD_NAME, + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + 'estimate'], + index=[0]) def test_wrong_num_quarters_passed(self): - dataset = QuartersEstimates(-1) + bad_dataset1 = QuartersEstimates(-1) + bad_dataset2 = QuartersEstimates(-2) + good_dataset = QuartersEstimates(1) + engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + columns = {c.name + str(dataset.num_quarters): c.latest + for dataset in (bad_dataset1, + bad_dataset2, + good_dataset) + for c in dataset.columns} + p = Pipeline(columns) + + with self.assertRaises(ValueError) as e: + engine.run_pipeline( + p, + start_date=self.trading_days[0], + end_date=self.trading_days[-1], + ) + assert_raises_regex(e, INVALID_NUM_QTRS_MESSAGE % "-1,-2") + + def test_no_num_quarters_attr(self): + dataset = QuartersEstimatesNoNumQuartersAttr(1) engine = SimplePipelineEngine( lambda x: self.loader, self.trading_days, @@ -132,7 +174,7 @@ class WithWrongNumQuarters(WithEstimates): ) p = Pipeline({c.name: 
c.latest for c in dataset.columns}) - with self.assertRaises(ValueError): + with self.assertRaises(AttributeError): engine.run_pipeline( p, start_date=self.trading_days[0], @@ -140,7 +182,7 @@ class WithWrongNumQuarters(WithEstimates): ) -class PreviousWithWrongNumQuarters(WithWrongNumQuarters, +class PreviousWithWrongNumQuarters(WithWrongLoaderDefinition, ZiplineTestCase): """ Tests that previous quarter loader correctly breaks if an incorrect @@ -151,7 +193,7 @@ class PreviousWithWrongNumQuarters(WithWrongNumQuarters, return PreviousQuartersEstimatesLoader(events, columns) -class NextWithWrongNumQuarters(WithWrongNumQuarters, +class NextWithWrongNumQuarters(WithWrongLoaderDefinition, ZiplineTestCase): """ Tests that next quarter loader correctly breaks if an incorrect @@ -162,7 +204,7 @@ class NextWithWrongNumQuarters(WithWrongNumQuarters, return NextQuartersEstimatesLoader(events, columns) -class WithEstimatesT0(WithEstimates): +class WithEstimatesTimeZero(WithEstimates): """ ZiplineTestCase mixin providing cls.events as a class level fixture and defining a test for all inheritors to use. @@ -173,9 +215,9 @@ class WithEstimatesT0(WithEstimates): Generated dynamically in order to test inter-leavings of estimates and event dates for multiple quarters to make sure that we select the right immediate 'next' or 'previous' quarter relative to each date - - i.e., the right 't0' on the timeline. We care about selecting the - right 't0' because we use that to calculate which quarter's data needs - to be returned for each day. + i.e., the right 'time zero' on the timeline. We care about selecting + the right 'time zero' because we use that to calculate which quarter's + data needs to be returned for each day. Methods ------- @@ -189,8 +231,8 @@ class WithEstimatesT0(WithEstimates): Tests ------ test_estimates() - Tests that we get the right 't0' value on each day for each sid and - for each column. 
+ Tests that we get the right 'time zero' value on each day for each + sid and for each column. """ q1_knowledge_dates = [pd.Timestamp('2015-01-01'), pd.Timestamp('2015-01-04'), @@ -212,7 +254,7 @@ class WithEstimatesT0(WithEstimates): pd.Timestamp('2015-01-31')] @classmethod - def gen_estimates(cls): + def make_events(cls): """ In order to determine which estimate we care about for a particular sid, we need to look at all estimates that we have for that sid and @@ -296,8 +338,8 @@ class WithEstimatesT0(WithEstimates): @classmethod def init_class_fixtures(cls): # Must be generated before call to super since super uses `events`. - cls.events = cls.gen_estimates() - super(WithEstimatesT0, cls).init_class_fixtures() + cls.events = cls.make_events() + super(WithEstimatesTimeZero, cls).init_class_fixtures() def get_expected_estimate(self, q1_knowledge, @@ -344,14 +386,17 @@ class WithEstimatesT0(WithEstimates): q2_knowledge, comparable_date, ) - if not expected_estimate.empty: - for colname in sid_estimates.columns: - expected_value = expected_estimate[colname] - computed_value = sid_estimates.iloc[i][colname] - assert_equal(expected_value, computed_value) + # Have to explicitly check for None because + # `expected_estimate` might be a DataFrame. + if expected_estimate is not None: + assert_series_equal( + sid_estimates.iloc[i], + expected_estimate[sid_estimates.columns], + check_names=False + ) else: - # There are no eligible 'next' estimates on this day; - # everything should be null. + # There are no eligible 'next'/'previous' estimates on + # this day; everything should be null. 
assert_true(sid_estimates.iloc[i].isnull().all()) else: # We don't know about any estimates on this day; @@ -359,7 +404,7 @@ class WithEstimatesT0(WithEstimates): assert_true(sid_estimates.iloc[i].isnull().all()) -class NextEstimate(WithEstimatesT0, ZiplineTestCase): +class NextEstimate(WithEstimatesTimeZero, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): return NextQuartersEstimatesLoader(events, columns) @@ -372,17 +417,17 @@ class NextEstimate(WithEstimatesT0, ZiplineTestCase): # happening on this simulation date or later, then that's # the estimate we want to use. if (not q1_knowledge.empty and - q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= + q1_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] >= comparable_date): return q1_knowledge.iloc[-1] # If q1 has already happened or we don't know about it # yet and our latest knowledge indicates that q2 hasn't # happened yet, then that's the estimate we want to use. elif (not q2_knowledge.empty and - q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] >= + q2_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] >= comparable_date): return q2_knowledge.iloc[-1] - return pd.DataFrame() + return None class BlazeNextEstimateLoaderTestCase(NextEstimate): @@ -398,7 +443,7 @@ class BlazeNextEstimateLoaderTestCase(NextEstimate): ) -class PreviousEstimate(WithEstimatesT0, ZiplineTestCase): +class PreviousEstimate(WithEstimatesTimeZero, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): return PreviousQuartersEstimatesLoader(events, columns) @@ -413,14 +458,14 @@ class PreviousEstimate(WithEstimatesT0, ZiplineTestCase): # Otherwise, it'll be for q1, as long as the release date # for q1 has already happened. 
if (not q2_knowledge.empty and - q2_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= + q2_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] <= comparable_date): return q2_knowledge.iloc[-1] elif (not q1_knowledge.empty and - q1_knowledge.iloc[-1][EVENT_DATE_FIELD_NAME] <= + q1_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] <= comparable_date): return q1_knowledge.iloc[-1] - return pd.DataFrame() + return None class BlazePreviousEstimateLoaderTestCase(PreviousEstimate): @@ -461,16 +506,19 @@ class WithEstimateMultipleQuarters(WithEstimates): out and checks that the returned columns contain data for the correct number of quarters out. """ - events = pd.DataFrame({ - SID_FIELD_NAME: [0] * 2, - TS_FIELD_NAME: [pd.Timestamp('2015-01-01'), - pd.Timestamp('2015-01-06')], - EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-10'), - pd.Timestamp('2015-01-20')], - 'estimate': [1., 2.], - FISCAL_QUARTER_FIELD_NAME: [1, 2], - FISCAL_YEAR_FIELD_NAME: [2015, 2015] - }) + + @classmethod + def make_events(cls): + return pd.DataFrame({ + SID_FIELD_NAME: [0] * 2, + TS_FIELD_NAME: [pd.Timestamp('2015-01-01'), + pd.Timestamp('2015-01-06')], + EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-20')], + 'estimate': [1., 2.], + FISCAL_QUARTER_FIELD_NAME: [1, 2], + FISCAL_YEAR_FIELD_NAME: [2015, 2015] + }) @classmethod def init_class_fixtures(cls): @@ -635,35 +683,6 @@ class WithEstimateWindows(WithEstimates): Tests that we overwrite values with the correct quarter's estimate at the correct dates when we have a factor that asks for a window of data. """ - sid_0_timeline = pd.DataFrame({ - TS_FIELD_NAME: [pd.Timestamp('2015-01-05'), - pd.Timestamp('2015-01-07'), - pd.Timestamp('2015-01-05'), - pd.Timestamp('2015-01-17')], - EVENT_DATE_FIELD_NAME: - [pd.Timestamp('2015-01-10'), - pd.Timestamp('2015-01-10'), - pd.Timestamp('2015-01-20'), - pd.Timestamp('2015-01-20')], - 'estimate': [10., 11.] 
+ [20., 21.], - FISCAL_QUARTER_FIELD_NAME: [1] * 2 + [2] * 2, - FISCAL_YEAR_FIELD_NAME: 2015, - SID_FIELD_NAME: 0, - }) - - sid_1_timeline = pd.DataFrame({ - TS_FIELD_NAME: [pd.Timestamp('2015-01-09'), - pd.Timestamp('2015-01-12'), - pd.Timestamp('2015-01-09'), - pd.Timestamp('2015-01-15')], - EVENT_DATE_FIELD_NAME: - [pd.Timestamp('2015-01-12'), pd.Timestamp('2015-01-12'), - pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-15')], - 'estimate': [10., 11.] + [30., 31.], - FISCAL_QUARTER_FIELD_NAME: [1] * 2 + [3] * 2, - FISCAL_YEAR_FIELD_NAME: 2015, - SID_FIELD_NAME: 1 - }) window_test_start_date = pd.Timestamp('2015-01-05') critical_dates = [pd.Timestamp('2015-01-09', tz='utc'), @@ -673,7 +692,39 @@ class WithEstimateWindows(WithEstimates): # window length, starting date, num quarters out, timeline. Parameterizes # over number of quarters out. window_test_cases = list(itertools.product(critical_dates, (1, 2))) - events = pd.concat([sid_0_timeline, sid_1_timeline]) + + @classmethod + def make_events(cls): + sid_0_timeline = pd.DataFrame({ + TS_FIELD_NAME: [pd.Timestamp('2015-01-05'), + pd.Timestamp('2015-01-07'), + pd.Timestamp('2015-01-05'), + pd.Timestamp('2015-01-17')], + EVENT_DATE_FIELD_NAME: + [pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-10'), + pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-01-20')], + 'estimate': [10., 11.] + [20., 21.], + FISCAL_QUARTER_FIELD_NAME: [1] * 2 + [2] * 2, + FISCAL_YEAR_FIELD_NAME: 2015, + SID_FIELD_NAME: 0, + }) + + sid_1_timeline = pd.DataFrame({ + TS_FIELD_NAME: [pd.Timestamp('2015-01-09'), + pd.Timestamp('2015-01-12'), + pd.Timestamp('2015-01-09'), + pd.Timestamp('2015-01-15')], + EVENT_DATE_FIELD_NAME: + [pd.Timestamp('2015-01-12'), pd.Timestamp('2015-01-12'), + pd.Timestamp('2015-01-15'), pd.Timestamp('2015-01-15')], + 'estimate': [10., 11.] 
+ [30., 31.], + FISCAL_QUARTER_FIELD_NAME: [1] * 2 + [3] * 2, + FISCAL_YEAR_FIELD_NAME: 2015, + SID_FIELD_NAME: 1 + }) + return pd.concat([sid_0_timeline, sid_1_timeline]) @classmethod def make_expected_timelines(cls): From f140cdb8d610662101b0f3a2c9a3b97986c7c8b5 Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Tue, 13 Sep 2016 15:52:32 -0400 Subject: [PATCH 12/16] MAINT: add checkpointing BUG: add cols for sids with no data and get adjustments outside column loop --- zipline/pipeline/loaders/blaze/core.py | 72 ++++---- zipline/pipeline/loaders/blaze/estimates.py | 7 +- zipline/pipeline/loaders/blaze/utils.py | 6 +- zipline/pipeline/loaders/quarter_estimates.py | 172 +++++++++--------- 4 files changed, 134 insertions(+), 123 deletions(-) diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 5264a954..61782c31 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -1028,22 +1028,10 @@ class BlazeLoader(dict): return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs) - if checkpoints is not None: - ts = checkpoints[TS_FIELD_NAME] - checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp) - if pd.isnull(checkpoints_ts): - materialized_checkpoints = pd.DataFrame(columns=colnames) - lower = None - else: - materialized_checkpoints = odo( - checkpoints[ts == checkpoints_ts][colnames], - pd.DataFrame, - **odo_kwargs - ) - lower = checkpoints_ts - else: - materialized_checkpoints = pd.DataFrame(columns=colnames) - lower = None + + lower, materialized_checkpoints = get_materialized_checkpoints( + checkpoints, colnames, lower_dt, odo_kwargs + ) materialized_expr = self.pool.apply_async(collect_expr, (expr, lower)) materialized_deltas = ( @@ -1137,6 +1125,7 @@ class BlazeLoader(dict): for column_idx, column in enumerate(columns) } + global_loader = BlazeLoader.global_instance() @@ -1168,12 +1157,32 @@ def bind_expression_to_resources(expr, resources): }) +def 
get_materialized_checkpoints(checkpoints, colnames, lower_dt, odo_kwargs): + if checkpoints is not None: + ts = checkpoints[TS_FIELD_NAME] + checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp) + if pd.isnull(checkpoints_ts): + materialized_checkpoints = pd.DataFrame(columns=colnames) + lower = None + else: + materialized_checkpoints = odo( + checkpoints[ts == checkpoints_ts][colnames], + pd.DataFrame, + **odo_kwargs + ) + lower = checkpoints_ts + else: + materialized_checkpoints = pd.DataFrame(columns=colnames) + lower = None + return lower, materialized_checkpoints + + def ffill_query_in_range(expr, lower, upper, + checkpoints=None, odo_kwargs=None, - ts_field=TS_FIELD_NAME, - sid_field=SID_FIELD_NAME): + ts_field=TS_FIELD_NAME): """Query a blaze expression in a given time range properly forward filling from values that fall before the lower date. @@ -1199,27 +1208,24 @@ def ffill_query_in_range(expr, start before the requested start date if a value is needed to ffill. """ odo_kwargs = odo_kwargs or {} - filtered = expr[expr[ts_field] <= lower] - computed_lower = odo( - bz.by( - filtered[sid_field], - timestamp=filtered[ts_field].max(), - ).timestamp.min(), - pd.Timestamp, - **odo_kwargs + computed_lower, materialized_checkpoints = get_materialized_checkpoints( + checkpoints, expr.fields, lower, odo_kwargs ) if pd.isnull(computed_lower): # If there is no lower date, just query for data in the date # range. It must all be null anyways. 
computed_lower = lower - raw = odo( - expr[ - (expr[ts_field] >= computed_lower) & - (expr[ts_field] <= upper) - ], - pd.DataFrame, - **odo_kwargs + raw = pd.concat( + [materialized_checkpoints, + odo( + expr[ + (expr[ts_field] >= computed_lower) & + (expr[ts_field] <= upper) + ], + pd.DataFrame, + **odo_kwargs + )] ) raw.loc[:, ts_field] = raw.loc[:, ts_field].astype('datetime64[ns]') return raw diff --git a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py index 864b05a7..b991c62c 100644 --- a/zipline/pipeline/loaders/blaze/estimates.py +++ b/zipline/pipeline/loaders/blaze/estimates.py @@ -63,7 +63,8 @@ class BlazeEstimatesLoader(PipelineLoader): resources=None, odo_kwargs=None, data_query_time=None, - data_query_tz=None): + data_query_tz=None, + checkpoints=None): dshape = expr.dshape if not istabular(dshape): @@ -83,6 +84,7 @@ class BlazeEstimatesLoader(PipelineLoader): check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz + self._checkpoints = checkpoints def load_adjusted_array(self, columns, dates, assets, mask): raw = load_raw_data(assets, @@ -90,7 +92,8 @@ class BlazeEstimatesLoader(PipelineLoader): self._data_query_time, self._data_query_tz, self._expr, - self._odo_kwargs) + self._odo_kwargs, + checkpoints=self._checkpoints) return self.loader( raw, diff --git a/zipline/pipeline/loaders/blaze/utils.py b/zipline/pipeline/loaders/blaze/utils.py index 963d9f9a..b5be9cd9 100644 --- a/zipline/pipeline/loaders/blaze/utils.py +++ b/zipline/pipeline/loaders/blaze/utils.py @@ -11,7 +11,8 @@ def load_raw_data(assets, data_query_time, data_query_tz, expr, - odo_kwargs): + odo_kwargs, + checkpoints=None): """ given an expression representing data to load, perform normalization and forward-filling and return the data, materialized. 
@@ -48,7 +49,8 @@ def load_raw_data(assets, expr, lower_dt, upper_dt, - odo_kwargs, + checkpoints=checkpoints, + odo_kwargs=odo_kwargs, ) sids = raw[SID_FIELD_NAME] raw.drop( diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py index 54686a6e..71ff3a32 100644 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ b/zipline/pipeline/loaders/quarter_estimates.py @@ -1,5 +1,7 @@ +from collections import defaultdict from abc import abstractmethod import numpy as np +import pandas as pd from six import viewvalues from toolz import groupby @@ -152,30 +154,22 @@ class QuarterEstimatesLoader(PipelineLoader): return requested_qtr_data def get_adjustments(self, - zero_qtr_idx, - requested_qtr_idx, - stacked_last_per_qtr, + zero_qtr_data, + requested_qtr_data, last_per_qtr, dates, - column_name, - column, - mask, - assets): + assets, + columns): """ Creates an AdjustedArray from the given estimates data for the given dates. Parameters ---------- - zero_qtr_idx : pd.MultiIndex - The index of the row of the zeroth (immediately next/previous) - quarter from each date for each sid. - requested_qtr_idx : pd.MultiIndex - The index of the row of the requested quarter from each date for - each sid. - stacked_last_per_qtr : pd.DataFrame - The latest estimate known per sid per date per quarter with the - dates, normalized quarter, and sid as the index. + zero_qtr_data : pd.DataFrame + The 'time zero' data for each date/sid. + zero_qtr_data : pd.DataFrame + The data for the requested quarter. last_per_qtr : pd.DataFrame The latest estimate known per sid per date per quarter with dates as the index and normalized quarter and sid in the columns @@ -198,18 +192,11 @@ class QuarterEstimatesLoader(PipelineLoader): adjusted_array : AdjustedArray The array of data and overwrites for the given column. 
""" - adjustments = {} - requested_qtr_data = self.get_requested_data_for_col( - stacked_last_per_qtr, requested_qtr_idx, dates - ) - zero_qtr_data = stacked_last_per_qtr.loc[zero_qtr_idx] + col_to_adjustments = defaultdict(dict) # We no longer need this in the index, but we do need it as a column # to calculate adjustments. zero_qtr_data = zero_qtr_data.reset_index(NORMALIZED_QUARTERS) - if column.dtype == datetime64ns_dtype: - overwrite = Datetime641DArrayOverwrite - else: - overwrite = Float641DArrayOverwrite + for sid_idx, sid in enumerate(assets): zero_qtr_sid_data = zero_qtr_data[ zero_qtr_data.index.get_level_values(SID_FIELD_NAME) == sid @@ -243,38 +230,35 @@ class QuarterEstimatesLoader(PipelineLoader): if isinstance(self, PreviousQuartersEstimatesLoader) else 'right' ) - adjustments[next_qtr_start_idx] = \ - self.create_overwrite_for_quarter( - next_qtr_start_idx, - column, - column_name, - dates, - last_per_qtr, - overwrite, - qtrs_with_estimates_for_sid, - requested_qtr_data, - sid, - sid_idx, - ) - - return AdjustedArray( - requested_qtr_data[column_name].values.astype(column.dtype), - mask, - dict(adjustments), - column.missing_value, - ) + self.create_overwrite_for_quarter( + col_to_adjustments, + next_qtr_start_idx, + dates, + last_per_qtr, + qtrs_with_estimates_for_sid, + requested_qtr_data, + sid, + sid_idx, + columns, + ) + return col_to_adjustments def create_overwrite_for_quarter(self, + col_to_adjustments, next_qtr_start_idx, - column, - column_name, dates, last_per_qtr, - overwrite, quarters_with_estimates_for_sid, requested_qtr_data, sid, - sid_idx): + sid_idx, + columns): + overwrites_dict = {} + for col in columns: + if col.dtype == datetime64ns_dtype: + overwrites_dict[col] = Datetime641DArrayOverwrite + else: + overwrites_dict[col] = Float641DArrayOverwrite # Only add adjustments if the next quarter starts somewhere in # our date index for this sid. 
Our 'next' quarter can never # start at index 0; a starting index of 0 means that the next @@ -285,32 +269,35 @@ class QuarterEstimatesLoader(PipelineLoader): requested_quarter = requested_qtr_data[ SHIFTED_NORMALIZED_QTRS ][sid].iloc[next_qtr_start_idx] - - # If there are estimates for the requested quarter, - # overwrite all values going up to the starting index of - # that quarter with estimates for that quarter. - if requested_quarter in quarters_with_estimates_for_sid: - return self.create_overwrite_for_estimate( - column, - column_name, - last_per_qtr, - next_qtr_start_idx, - overwrite, - requested_quarter, - sid, - sid_idx - ) - # There are no estimates for the quarter. Overwrite all - # values going up to the starting index of that quarter - # with the missing value for this column. - else: - return self.overwrite_with_null( - column, - last_per_qtr, - next_qtr_start_idx, - overwrite, - sid_idx - ) + for col in columns: + column_name = self.name_map[col.name] + # If there are estimates for the requested quarter, + # overwrite all values going up to the starting index of + # that quarter with estimates for that quarter. + if requested_quarter in quarters_with_estimates_for_sid: + col_to_adjustments[column_name][next_qtr_start_idx] = \ + self.create_overwrite_for_estimate( + col, + column_name, + last_per_qtr, + next_qtr_start_idx, + overwrites_dict[col], + requested_quarter, + sid, + sid_idx + ) + # There are no estimates for the quarter. Overwrite all + # values going up to the starting index of that quarter + # with the missing value for this column. 
+ else: + col_to_adjustments[column_name][next_qtr_start_idx] =\ + self.overwrite_with_null( + col, + last_per_qtr, + next_qtr_start_idx, + overwrites_dict[col], + sid_idx + ) def overwrite_with_null(self, column, @@ -353,7 +340,6 @@ class QuarterEstimatesLoader(PipelineLoader): ) out = {} - for num_quarters, columns in groups.items(): # Determine the last piece of information we know for each column # on each date in the index for each sid and quarter. @@ -388,19 +374,33 @@ class QuarterEstimatesLoader(PipelineLoader): ), shifted_qtr_data[SHIFTED_NORMALIZED_QTRS] ]).index + requested_qtr_data = self.get_requested_data_for_col( + stacked_last_per_qtr, requested_qtr_idx, dates + ) - for c in columns: - column_name = self.name_map[c.name] - adjusted_array = self.get_adjustments(zero_qtr_idx, - requested_qtr_idx, - stacked_last_per_qtr, + zero_qtr_data = stacked_last_per_qtr.loc[zero_qtr_idx] + + col_to_adjustments = self.get_adjustments(zero_qtr_data, + requested_qtr_data, last_per_qtr, dates, - column_name, - c, - mask, - assets) - out[c] = adjusted_array + assets, + columns) + for col in columns: + column_name = self.name_map[col.name] + # We may have dropped assets if they never have any data for the + # requested quarter. 
+ df = pd.DataFrame(data=requested_qtr_data[column_name], + index=dates, + columns=assets, + dtype=col.dtype) + + out[col] = AdjustedArray( + df.values.astype(col.dtype), + mask, + dict(col_to_adjustments[column_name]), + col.missing_value, + ) return out From 65d15d3960d959ce205886ba3b1acfd6c35fb81e Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Wed, 14 Sep 2016 10:26:29 -0400 Subject: [PATCH 13/16] TST: add test for sid with no data MAINT: optimization - only look at assets appearing in data TST: simplify test DOC: add documentation for checkpoints MAINT: explicitly cast event date field to datetime MAINT: add back import TST: fix indexing to remove setting wtih copy warning --- tests/pipeline/test_quarters_estimates.py | 192 +++--- zipline/pipeline/loaders/blaze/core.py | 22 +- zipline/pipeline/loaders/blaze/estimates.py | 27 +- zipline/pipeline/loaders/blaze/events.py | 6 + zipline/pipeline/loaders/blaze/utils.py | 11 +- .../pipeline/loaders/earnings_estimates.py | 619 ++++++++++++++++++ zipline/pipeline/loaders/quarter_estimates.py | 529 --------------- zipline/pipeline/loaders/utils.py | 7 +- zipline/testing/fixtures.py | 5 +- 9 files changed, 776 insertions(+), 642 deletions(-) create mode 100644 zipline/pipeline/loaders/earnings_estimates.py delete mode 100644 zipline/pipeline/loaders/quarter_estimates.py diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 56e7827b..4f7b8366 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -4,7 +4,6 @@ from nose.tools import assert_true from nose_parameterized import parameterized import numpy as np import pandas as pd -from pandas.util.testing import assert_frame_equal, assert_series_equal from toolz import merge from zipline.pipeline import SimplePipelineEngine, Pipeline, CustomFactor @@ -21,11 +20,11 @@ from zipline.pipeline.loaders.blaze.estimates import ( BlazeNextEstimatesLoader, 
BlazePreviousEstimatesLoader ) -from zipline.pipeline.loaders.quarter_estimates import ( +from zipline.pipeline.loaders.earnings_estimates import ( INVALID_NUM_QTRS_MESSAGE, - NextQuartersEstimatesLoader, + NextEarningsEstimatesLoader, normalize_quarters, - PreviousQuartersEstimatesLoader, + PreviousEarningsEstimatesLoader, split_normalized_quarters, ) from zipline.testing.fixtures import ( @@ -78,7 +77,7 @@ class WithEstimates(WithTradingSessions, WithAssetFinder): # Short window defined in order for test to run faster. START_DATE = pd.Timestamp('2014-12-28') - END_DATE = pd.Timestamp('2015-02-03') + END_DATE = pd.Timestamp('2015-02-04') @classmethod def make_loader(cls, events, columns): @@ -88,10 +87,14 @@ class WithEstimates(WithTradingSessions, WithAssetFinder): def make_events(cls): raise NotImplementedError('make_events') + @classmethod + def get_sids(cls): + return cls.events[SID_FIELD_NAME].unique() + @classmethod def init_class_fixtures(cls): cls.events = cls.make_events() - cls.sids = cls.events[SID_FIELD_NAME].unique() + cls.ASSET_FINDER_EQUITY_SIDS = cls.get_sids() cls.columns = { Estimates.event_date: 'event_date', Estimates.fiscal_quarter: 'fiscal_quarter', @@ -101,9 +104,6 @@ class WithEstimates(WithTradingSessions, WithAssetFinder): cls.loader = cls.make_loader(cls.events, {column.name: val for column, val in cls.columns.items()}) - cls.ASSET_FINDER_EQUITY_SIDS = list( - cls.events[SID_FIELD_NAME].unique() - ) cls.ASSET_FINDER_EQUITY_SYMBOLS = [ 's' + str(n) for n in cls.ASSET_FINDER_EQUITY_SIDS ] @@ -190,7 +190,7 @@ class PreviousWithWrongNumQuarters(WithWrongLoaderDefinition, """ @classmethod def make_loader(cls, events, columns): - return PreviousQuartersEstimatesLoader(events, columns) + return PreviousEarningsEstimatesLoader(events, columns) class NextWithWrongNumQuarters(WithWrongLoaderDefinition, @@ -201,7 +201,7 @@ class NextWithWrongNumQuarters(WithWrongLoaderDefinition, """ @classmethod def make_loader(cls, events, columns): - return 
NextQuartersEstimatesLoader(events, columns) + return NextEarningsEstimatesLoader(events, columns) class WithEstimatesTimeZero(WithEstimates): @@ -234,24 +234,27 @@ class WithEstimatesTimeZero(WithEstimates): Tests that we get the right 'time zero' value on each day for each sid and for each column. """ + # Shorter date range for performance + END_DATE = pd.Timestamp('2015-01-28') + q1_knowledge_dates = [pd.Timestamp('2015-01-01'), pd.Timestamp('2015-01-04'), - pd.Timestamp('2015-01-08'), - pd.Timestamp('2015-01-12')] - q2_knowledge_dates = [pd.Timestamp('2015-01-16'), + pd.Timestamp('2015-01-07'), + pd.Timestamp('2015-01-11')] + q2_knowledge_dates = [pd.Timestamp('2015-01-14'), + pd.Timestamp('2015-01-17'), pd.Timestamp('2015-01-20'), - pd.Timestamp('2015-01-24'), - pd.Timestamp('2015-01-28')] + pd.Timestamp('2015-01-23')] # We want to model the possibility of an estimate predicting a release date # that doesn't match the actual release. This could be done by dynamically # generating more combinations with different release dates, but that # significantly increases the amount of time it takes to run the tests. # These hard-coded cases are sufficient to know that we can update our # beliefs when we get new information. 
- q1_release_dates = [pd.Timestamp('2015-01-15'), - pd.Timestamp('2015-01-16')] # One day late - q2_release_dates = [pd.Timestamp('2015-01-30'), # One day early - pd.Timestamp('2015-01-31')] + q1_release_dates = [pd.Timestamp('2015-01-13'), + pd.Timestamp('2015-01-14')] # One day late + q2_release_dates = [pd.Timestamp('2015-01-25'), # One day early + pd.Timestamp('2015-01-26')] @classmethod def make_events(cls): @@ -300,8 +303,15 @@ class WithEstimatesTimeZero(WithEstimates): q2e2, sid)) sid_releases.append(cls.create_releases_df(sid)) + return pd.concat(sid_estimates + + sid_releases).reset_index(drop=True) - return pd.concat(sid_estimates + sid_releases).reset_index(drop=True) + @classmethod + def get_sids(cls): + sids = cls.events[SID_FIELD_NAME].unique() + # Tack on an extra sid to make sure that sids with no data are + # included but have all-null columns. + return list(sids) + [max(sids) + 1] @classmethod def create_releases_df(cls, sid): @@ -309,10 +319,10 @@ class WithEstimatesTimeZero(WithEstimates): # ranges in order to reduce the number of dates we need to iterate # through when testing. return pd.DataFrame({ - TS_FIELD_NAME: [pd.Timestamp('2015-01-15'), - pd.Timestamp('2015-01-31')], - EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-15'), - pd.Timestamp('2015-01-31')], + TS_FIELD_NAME: [pd.Timestamp('2015-01-13'), + pd.Timestamp('2015-01-26')], + EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-13'), + pd.Timestamp('2015-01-26')], 'estimate': [0.5, 0.8], FISCAL_QUARTER_FIELD_NAME: [1.0, 2.0], FISCAL_YEAR_FIELD_NAME: [2015.0, 2015.0], @@ -337,8 +347,6 @@ class WithEstimatesTimeZero(WithEstimates): @classmethod def init_class_fixtures(cls): - # Must be generated before call to super since super uses `events`. 
- cls.events = cls.make_events() super(WithEstimatesTimeZero, cls).init_class_fixtures() def get_expected_estimate(self, @@ -356,58 +364,42 @@ class WithEstimatesTimeZero(WithEstimates): ) results = engine.run_pipeline( Pipeline({c.name: c.latest for c in dataset.columns}), - start_date=self.trading_days[0], - end_date=self.trading_days[-1], + start_date=self.trading_days[1], + end_date=self.trading_days[-2], ) - for sid in self.sids: + for sid in self.ASSET_FINDER_EQUITY_SIDS: sid_estimates = results.xs(sid, level=1) - ts_sorted_estimates = self.events[ - self.events[SID_FIELD_NAME] == sid - ].sort(TS_FIELD_NAME) - for i, date in enumerate(sid_estimates.index): - comparable_date = date.tz_localize(None) - # Filter out estimates we don't know about yet. - ts_eligible_estimates = ts_sorted_estimates[ - ts_sorted_estimates[TS_FIELD_NAME] <= comparable_date + # Separate assertion for all-null DataFrame to avoid setting + # column dtypes on `all_expected`. + if sid == max(self.ASSET_FINDER_EQUITY_SIDS): + assert_true(sid_estimates.isnull().all().all()) + else: + ts_sorted_estimates = self.events[ + self.events[SID_FIELD_NAME] == sid + ].sort(TS_FIELD_NAME) + q1_knowledge = ts_sorted_estimates[ + ts_sorted_estimates[FISCAL_QUARTER_FIELD_NAME] == 1 ] - # If there are estimates we know about: - if not ts_eligible_estimates.empty: - # Determine the last piece of information we know about - # for q1 and q2. This takes advantage of the fact that we - # only have 2 quarters in the test data. - q1_knowledge = ts_eligible_estimates[ - ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 1 - ] - q2_knowledge = ts_eligible_estimates[ - ts_eligible_estimates[FISCAL_QUARTER_FIELD_NAME] == 2 - ] - expected_estimate = self.get_expected_estimate( - q1_knowledge, - q2_knowledge, - comparable_date, - ) - # Have to explicitly check for None because - # `expected_estimate` might be a DataFrame. 
- if expected_estimate is not None: - assert_series_equal( - sid_estimates.iloc[i], - expected_estimate[sid_estimates.columns], - check_names=False - ) - else: - # There are no eligible 'next'/'previous' estimates on - # this day; everything should be null. - assert_true(sid_estimates.iloc[i].isnull().all()) - else: - # We don't know about any estimates on this day; - # everything should be null. - assert_true(sid_estimates.iloc[i].isnull().all()) + q2_knowledge = ts_sorted_estimates[ + ts_sorted_estimates[FISCAL_QUARTER_FIELD_NAME] == 2 + ] + all_expected = pd.concat( + [self.get_expected_estimate( + q1_knowledge[q1_knowledge[TS_FIELD_NAME] <= + date.tz_localize(None)], + q2_knowledge[q2_knowledge[TS_FIELD_NAME] <= + date.tz_localize(None)], + date.tz_localize(None), + ).set_index([[date]]) for date in sid_estimates.index], + axis=0) + assert_equal(all_expected[sid_estimates.columns], + sid_estimates) class NextEstimate(WithEstimatesTimeZero, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): - return NextQuartersEstimatesLoader(events, columns) + return NextEarningsEstimatesLoader(events, columns) def get_expected_estimate(self, q1_knowledge, @@ -419,15 +411,16 @@ class NextEstimate(WithEstimatesTimeZero, ZiplineTestCase): if (not q1_knowledge.empty and q1_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] >= comparable_date): - return q1_knowledge.iloc[-1] + return q1_knowledge.iloc[-1:] # If q1 has already happened or we don't know about it # yet and our latest knowledge indicates that q2 hasn't # happened yet, then that's the estimate we want to use. 
elif (not q2_knowledge.empty and q2_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] >= comparable_date): - return q2_knowledge.iloc[-1] - return None + return q2_knowledge.iloc[-1:] + return pd.DataFrame(columns=q1_knowledge.columns, + index=[comparable_date]) class BlazeNextEstimateLoaderTestCase(NextEstimate): @@ -446,7 +439,7 @@ class BlazeNextEstimateLoaderTestCase(NextEstimate): class PreviousEstimate(WithEstimatesTimeZero, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): - return PreviousQuartersEstimatesLoader(events, columns) + return PreviousEarningsEstimatesLoader(events, columns) def get_expected_estimate(self, q1_knowledge, @@ -460,12 +453,13 @@ class PreviousEstimate(WithEstimatesTimeZero, ZiplineTestCase): if (not q2_knowledge.empty and q2_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] <= comparable_date): - return q2_knowledge.iloc[-1] + return q2_knowledge.iloc[-1:] elif (not q1_knowledge.empty and q1_knowledge[EVENT_DATE_FIELD_NAME].iloc[-1] <= comparable_date): - return q1_knowledge.iloc[-1] - return None + return q1_knowledge.iloc[-1:] + return pd.DataFrame(columns=q1_knowledge.columns, + index=[comparable_date]) class BlazePreviousEstimateLoaderTestCase(PreviousEstimate): @@ -572,8 +566,8 @@ class WithEstimateMultipleQuarters(WithEstimates): # quarters out for each of the dataset columns. 
assert_equal(sorted(np.array(q1_columns + q2_columns)), sorted(results.columns.values)) - assert_frame_equal(self.expected_out.sort(axis=1), - results.xs(0, level=1).sort(axis=1)) + assert_equal(self.expected_out.sort(axis=1), + results.xs(0, level=1).sort(axis=1)) class NextEstimateMultipleQuarters( @@ -581,17 +575,19 @@ class NextEstimateMultipleQuarters( ): @classmethod def make_loader(cls, events, columns): - return NextQuartersEstimatesLoader(events, columns) + return NextEarningsEstimatesLoader(events, columns) @classmethod def fill_expected_out(cls, expected): # Fill columns for 1 Q out for raw_name in cls.columns.values(): - expected[raw_name + '1'].loc[ - pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-11') + expected.loc[ + pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-11'), + raw_name + '1' ] = cls.events[raw_name].iloc[0] - expected[raw_name + '1'].loc[ - pd.Timestamp('2015-01-11'):pd.Timestamp('2015-01-20') + expected.loc[ + pd.Timestamp('2015-01-11'):pd.Timestamp('2015-01-20'), + raw_name + '1' ] = cls.events[raw_name].iloc[1] # Fill columns for 2 Q out @@ -599,19 +595,23 @@ class NextEstimateMultipleQuarters( # Q1's event happens; after Q1's event, we know 1 Q out but not 2 Qs # out. 
for col_name in ['estimate', 'event_date']: - expected[col_name + '2'].loc[ - pd.Timestamp('2015-01-06'):pd.Timestamp('2015-01-10') + expected.loc[ + pd.Timestamp('2015-01-06'):pd.Timestamp('2015-01-10'), + col_name + '2' ] = cls.events[col_name].iloc[1] # But we know what FQ and FY we'd need in both Q1 and Q2 # because we know which FQ is next and can calculate from there - expected[FISCAL_QUARTER_FIELD_NAME + '2'].loc[ - pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-09') + expected.loc[ + pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-09'), + FISCAL_QUARTER_FIELD_NAME + '2' ] = 2 - expected[FISCAL_QUARTER_FIELD_NAME + '2'].loc[ - pd.Timestamp('2015-01-12'):pd.Timestamp('2015-01-20') + expected.loc[ + pd.Timestamp('2015-01-12'):pd.Timestamp('2015-01-20'), + FISCAL_QUARTER_FIELD_NAME + '2' ] = 3 - expected[FISCAL_YEAR_FIELD_NAME + '2'].loc[ - pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-20') + expected.loc[ + pd.Timestamp('2015-01-01'):pd.Timestamp('2015-01-20'), + FISCAL_YEAR_FIELD_NAME + '2' ] = 2015 return expected @@ -624,7 +624,7 @@ class PreviousEstimateMultipleQuarters( @classmethod def make_loader(cls, events, columns): - return PreviousQuartersEstimatesLoader(events, columns) + return PreviousEarningsEstimatesLoader(events, columns) @classmethod def fill_expected_out(cls, expected): @@ -804,7 +804,7 @@ class WithEstimateWindows(WithEstimates): class PreviousEstimateWindows(WithEstimateWindows, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): - return PreviousQuartersEstimatesLoader(events, columns) + return PreviousEarningsEstimatesLoader(events, columns) @classmethod def make_expected_timelines(cls): @@ -867,7 +867,7 @@ class PreviousEstimateWindows(WithEstimateWindows, ZiplineTestCase): class NextEstimateWindows(WithEstimateWindows, ZiplineTestCase): @classmethod def make_loader(cls, events, columns): - return NextQuartersEstimatesLoader(events, columns) + return NextEarningsEstimatesLoader(events, columns) @classmethod def 
make_expected_timelines(cls): diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index 61782c31..d5c3e40b 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -1028,7 +1028,6 @@ class BlazeLoader(dict): return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs) - lower, materialized_checkpoints = get_materialized_checkpoints( checkpoints, colnames, lower_dt, odo_kwargs ) @@ -1158,6 +1157,22 @@ def bind_expression_to_resources(expr, resources): def get_materialized_checkpoints(checkpoints, colnames, lower_dt, odo_kwargs): + """ + Computes a lower bound and a DataFrame checkpoints. + + Parameters + ---------- + checkpoints : Expr + Bound blaze expression for a checkpoints table from which to get a + computed lower bound. + colnames : iterable of str + The names of the columns for which checkpoints should be computed. + lower_dt : pd.Timestamp + The lower date being queried for that serves as an upper bound for + checkpoints. + odo_kwargs : dict, optional + The extra keyword arguments to pass to ``odo``. + """ if checkpoints is not None: ts = checkpoints[TS_FIELD_NAME] checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp) @@ -1194,12 +1209,13 @@ def ffill_query_in_range(expr, The lower date to query for. upper : datetime The upper date to query for. + checkpoints : Expr, optional + Bound blaze expression for a checkpoints table from which to get a + computed lower bound. odo_kwargs : dict, optional The extra keyword arguments to pass to ``odo``. ts_field : str, optional The name of the timestamp field in the given blaze expression. - sid_field : str, optional - The name of the sid field in the given blaze expression. 
Returns ------- diff --git a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py index b991c62c..b63c58a6 100644 --- a/zipline/pipeline/loaders/blaze/estimates.py +++ b/zipline/pipeline/loaders/blaze/estimates.py @@ -3,11 +3,18 @@ from datashape import istabular from .core import ( bind_expression_to_resources, ) +from zipline.pipeline.common import ( + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + SID_FIELD_NAME, + TS_FIELD_NAME, +) from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.blaze.utils import load_raw_data -from zipline.pipeline.loaders.quarter_estimates import ( - NextQuartersEstimatesLoader, - PreviousQuartersEstimatesLoader, +from zipline.pipeline.loaders.earnings_estimates import ( + NextEarningsEstimatesLoader, + PreviousEarningsEstimatesLoader, required_estimates_fields, ) from zipline.pipeline.loaders.utils import ( @@ -35,6 +42,9 @@ class BlazeEstimatesLoader(PipelineLoader): The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. + checkpoints : Expr, optional + The expression representing checkpointed data to be used for faster + forward-filling of data from `expr`. Notes ----- @@ -55,6 +65,13 @@ class BlazeEstimatesLoader(PipelineLoader): If the '{TS_FIELD_NAME}' field is not included it is assumed that we start the backtest with knowledge of all announcements. 
""" + __doc__ = __doc__.format( + SID_FIELD_NAME=SID_FIELD_NAME, + TS_FIELD_NAME=TS_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME=FISCAL_YEAR_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME=FISCAL_QUARTER_FIELD_NAME, + EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME, + ) @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, @@ -107,8 +124,8 @@ class BlazeEstimatesLoader(PipelineLoader): class BlazeNextEstimatesLoader(BlazeEstimatesLoader): - loader = NextQuartersEstimatesLoader + loader = NextEarningsEstimatesLoader class BlazePreviousEstimatesLoader(BlazeEstimatesLoader): - loader = PreviousQuartersEstimatesLoader + loader = PreviousEarningsEstimatesLoader diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 56c050f2..6ecf3c6f 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -3,6 +3,8 @@ from datashape import istabular from .core import ( bind_expression_to_resources, ) +from zipline.pipeline.common import SID_FIELD_NAME, TS_FIELD_NAME, \ + EVENT_DATE_FIELD_NAME from zipline.pipeline.loaders.base import PipelineLoader from zipline.pipeline.loaders.blaze.utils import load_raw_data from zipline.pipeline.loaders.events import ( @@ -55,6 +57,10 @@ class BlazeEventsLoader(PipelineLoader): start the backtest with knowledge of all announcements. 
""" + __doc__ == __doc__.format(SID_FIELD_NAME=SID_FIELD_NAME, + TS_FIELD_NAME=TS_FIELD_NAME, + EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME) + @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, expr, diff --git a/zipline/pipeline/loaders/blaze/utils.py b/zipline/pipeline/loaders/blaze/utils.py index b5be9cd9..ab186c82 100644 --- a/zipline/pipeline/loaders/blaze/utils.py +++ b/zipline/pipeline/loaders/blaze/utils.py @@ -14,10 +14,11 @@ def load_raw_data(assets, odo_kwargs, checkpoints=None): """ - given an expression representing data to load, perform normalization and - forward-filling and return the data, materialized. + Given an expression representing data to load, perform normalization and + forward-filling and return the data, materialized. Only accepts data with a + `sid` field. - parameters + Parameters ---------- assets : pd.int64index the assets to load data for. @@ -32,8 +33,10 @@ def load_raw_data(assets, the expression representing the data to load. odo_kwargs : dict extra keyword arguments to pass to odo when executing the expression. + checkpoints : expr, optional + the expression representing the checkpointed data for `expr`. 
- returns + Returns ------- raw : pd.dataframe The result of computing expr and materializing the result as a diff --git a/zipline/pipeline/loaders/earnings_estimates.py b/zipline/pipeline/loaders/earnings_estimates.py new file mode 100644 index 00000000..3397d105 --- /dev/null +++ b/zipline/pipeline/loaders/earnings_estimates.py @@ -0,0 +1,619 @@ +from collections import defaultdict +from abc import abstractmethod, abstractproperty +import pandas as pd +from six import viewvalues +from toolz import groupby + +from zipline.lib.adjusted_array import AdjustedArray +from zipline.lib.adjustment import ( + Datetime641DArrayOverwrite, + Datetime64Overwrite, + Float641DArrayOverwrite, + Float64Overwrite, +) + +from zipline.pipeline.common import ( + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, + SID_FIELD_NAME, + TS_FIELD_NAME, +) +from zipline.pipeline.loaders.base import PipelineLoader +from zipline.utils.numpy_utils import datetime64ns_dtype, float64_dtype +from zipline.pipeline.loaders.utils import ( + ffill_across_cols, + last_in_date_group +) + + +INVALID_NUM_QTRS_MESSAGE = "Passed invalid number of quarters %s; " \ + "must pass a number of quarters >= 0" +NEXT_FISCAL_QUARTER = 'next_fiscal_quarter' +NEXT_FISCAL_YEAR = 'next_fiscal_year' +NORMALIZED_QUARTERS = 'normalized_quarters' +PREVIOUS_FISCAL_QUARTER = 'previous_fiscal_quarter' +PREVIOUS_FISCAL_YEAR = 'previous_fiscal_year' +SHIFTED_NORMALIZED_QTRS = 'shifted_normalized_quarters' +SIMULTATION_DATES = 'dates' + + +def normalize_quarters(years, quarters): + return years * 4 + quarters - 1 + + +def split_normalized_quarters(normalized_quarters): + years = normalized_quarters // 4 + quarters = normalized_quarters % 4 + return years, quarters + 1 + + +def required_estimates_fields(columns): + """ + Compute the set of resource columns required to serve + `columns`. + """ + # These metadata columns are used to align event indexers. 
+ return { + TS_FIELD_NAME, + SID_FIELD_NAME, + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME + }.union( + # We also expect any of the field names that our loadable columns + # are mapped to. + viewvalues(columns), + ) + + +def validate_column_specs(events, columns): + """ + Verify that the columns of ``events`` can be used by a + EarningsEstimatesLoader to serve the BoundColumns described by + `columns`. + """ + required = required_estimates_fields(columns) + received = set(events.columns) + missing = required - received + if missing: + raise ValueError( + "EarningsEstimatesLoader missing required columns {missing}.\n" + "Got Columns: {received}\n" + "Expected Columns: {required}".format( + missing=sorted(missing), + received=sorted(received), + required=sorted(required), + ) + ) + + +class EarningsEstimatesLoader(PipelineLoader): + """ + An abstract pipeline loader for estimates data that can load data a + variable number of quarters forwards/backwards from calendar dates + depending on the `num_quarters` attribute of the columns' dataset. + + Parameters + ---------- + estimates : pd.DataFrame + The raw estimates data. + ``estimates`` must contain at least 5 columns: + sid : int64 + The asset id associated with each estimate. + + event_date : datetime64[ns] + The date on which the event that the estimate is for will/has + occurred.. + + timestamp : datetime64[ns] + The date on which we learned about the estimate. + + fiscal_quarter : int64 + The quarter during which the event has/will occur. + + fiscal_year : int64 + The year during which the event has/will occur. + + name_map : dict[str -> str] + A map of names of BoundColumns that this loader will load to the + names of the corresponding columns in `events`. 
+ """ + def __init__(self, + estimates, + name_map): + validate_column_specs( + estimates, + name_map + ) + + self.estimates = estimates[ + estimates[EVENT_DATE_FIELD_NAME].notnull() & + estimates[FISCAL_QUARTER_FIELD_NAME].notnull() & + estimates[FISCAL_YEAR_FIELD_NAME].notnull() + ] + self.estimates[NORMALIZED_QUARTERS] = normalize_quarters( + self.estimates[FISCAL_YEAR_FIELD_NAME], + self.estimates[FISCAL_QUARTER_FIELD_NAME], + ) + self.array_overwrites_dict = {datetime64ns_dtype: + Datetime641DArrayOverwrite, + float64_dtype: Float641DArrayOverwrite} + self.scalar_overwrites_dict = {datetime64ns_dtype: Datetime64Overwrite, + float64_dtype: Float64Overwrite} + + self.name_map = name_map + + @abstractmethod + def get_zeroth_quarter_idx(self, num_quarters, last, dates): + raise NotImplementedError('get_zeroth_quarter_idx') + + @abstractmethod + def get_shifted_qtrs(self, zero_qtrs, num_quarters): + raise NotImplementedError('get_shifted_qtrs') + + @abstractmethod + def create_overwrite_for_estimate(self, + column, + column_name, + last_per_qtr, + next_qtr_start_idx, + requested_quarter, + sid, + sid_idx): + raise NotImplementedError('create_overwrite_for_estimate') + + @abstractproperty + def searchsorted_side(self): + return NotImplementedError('searchsorted_side') + + def get_requested_quarter_data(self, stacked_last_per_qtr, idx, dates): + """ + Selects the requested data for each date. + + Parameters + ---------- + stacked_last_per_qtr : pd.DataFrame + The latest estimate known with the dates, normalized quarter, and + sid as the index. + idx : pd.MultiIndex + The index of the row of the requested quarter from each date for + each sid. + dates : pd.DatetimeIndex + The calendar dates for which estimates data is requested. 
+ + Returns + -------- + requested_qtr_data : pd.DataFrame + The DataFrame with the latest values for the requested quarter + for all columns; `dates` are the index and columns are a MultiIndex + with sids at the top level and the dataset columns on the bottom. + """ + requested_qtr_data = stacked_last_per_qtr.loc[idx] + # We've lost the index names when doing `loc`, so set them here. + requested_qtr_data.index = requested_qtr_data.index.set_names( + idx.names + ) + requested_qtr_data = requested_qtr_data.reset_index( + SHIFTED_NORMALIZED_QTRS + ) + # Calculate the actual year/quarter being requested and add those in + # as columns. + (requested_qtr_data[FISCAL_YEAR_FIELD_NAME], + requested_qtr_data[FISCAL_QUARTER_FIELD_NAME]) = \ + split_normalized_quarters( + requested_qtr_data[SHIFTED_NORMALIZED_QTRS] + ) + # Once we're left with just dates as the index, we can reindex by all + # dates so that we have a value for each calendar date. + return requested_qtr_data.unstack(SID_FIELD_NAME).reindex(dates) + + def get_adjustments(self, + zero_qtr_data, + requested_qtr_data, + last_per_qtr, + dates, + assets, + columns): + """ + Creates an AdjustedArray from the given estimates data for the given + dates. + + Parameters + ---------- + zero_qtr_data : pd.DataFrame + The 'time zero' data for each calendar date per sid. + requested_qtr_data : pd.DataFrame + The requested quarter data for each calendar date per sid. + last_per_qtr : pd.DataFrame + A DataFrame with a column MultiIndex of [self.estimates.columns, + normalized_quarters, sid] that allows easily getting the timeline + of estimates for a particular sid for a particular quarter. + dates : pd.DatetimeIndex + The calendar dates for which estimates data is requested. + assets : pd.Int64Index + An index of all the assets from the raw data. + columns : list of BoundColumn + The columns for which adjustments need to be calculated. 
+ + Returns + ------- + adjusted_array : AdjustedArray + The array of data and overwrites for the given column. + """ + col_to_overwrites = defaultdict(dict) + # We no longer need NORMALIZED_QUARTERS in the index, but we do need it + # as a column to calculate adjustments. + zero_qtr_data = zero_qtr_data.reset_index(NORMALIZED_QUARTERS) + + for sid_idx, sid in enumerate(assets): + zero_qtr_sid_data = zero_qtr_data[ + zero_qtr_data.index.get_level_values(SID_FIELD_NAME) == sid + ] + # Determine where quarters are changing for this sid. + qtr_shifts = zero_qtr_sid_data[ + zero_qtr_sid_data[NORMALIZED_QUARTERS] != + zero_qtr_sid_data[NORMALIZED_QUARTERS].shift(1) + ] + # On dates where we don't have any information about quarters, + # we will get nulls, and each of these will be interpreted as + # quarter shifts. We need to remove these here. + qtr_shifts = qtr_shifts[ + qtr_shifts[NORMALIZED_QUARTERS].notnull() + ] + # For the given sid, determine which quarters we have estimates + # for. + qtrs_with_estimates_for_sid = last_per_qtr.xs( + sid, axis=1, level=SID_FIELD_NAME + ).groupby(axis=1, level=1).first().columns.values + for row_indexer in list(qtr_shifts.index): + # Find the starting index of the quarter that comes right + # after this row. This isn't the starting index of the + # requested quarter, but simply the date we cross over into a + # new quarter. + next_qtr_start_idx = dates.searchsorted( + zero_qtr_data.loc[ + row_indexer + ][EVENT_DATE_FIELD_NAME], + side=self.searchsorted_side + ) + # Only add adjustments if the next quarter starts somewhere in + # our date index for this sid. Our 'next' quarter can never + # start at index 0; a starting index of 0 means that the next + # quarter's event date was NaT. 
+ if 0 < next_qtr_start_idx < len(dates): + self.create_overwrite_for_quarter( + col_to_overwrites, + next_qtr_start_idx, + last_per_qtr, + qtrs_with_estimates_for_sid, + requested_qtr_data, + sid, + sid_idx, + columns, + ) + return col_to_overwrites + + def create_overwrite_for_quarter(self, + col_to_overwrites, + next_qtr_start_idx, + last_per_qtr, + quarters_with_estimates_for_sid, + requested_qtr_data, + sid, + sid_idx, + columns): + """ + Add entries to the dictionary of columns to adjustments for the given + sid and the given quarter. + + Parameters + ---------- + col_to_overwrites : dict [column_name -> list of ArrayAdjustment] + A dictionary mapping column names to all overwrites for those + columns. + next_qtr_start_idx : int + The index of the first day of the next quarter in the calendar + dates. + last_per_qtr : pd.DataFrame + A DataFrame with a column MultiIndex of [self.estimates.columns, + normalized_quarters, sid] that allows easily getting the timeline + of estimates for a particular sid for a particular quarter; this + is particularly useful for getting adjustments for 'next' + estimates. + quarters_with_estimates_for_sid : np.array + An array of all quarters for which there are estimates for the + given sid. + sid : int + The sid for which to create overwrites. + sid_idx : int + The index of the sid in `assets`. + columns : list of BoundColumn + The columns for which to create overwrites. + """ + + # Find the quarter being requested in the quarter we're + # crossing into. + requested_quarter = requested_qtr_data[ + SHIFTED_NORMALIZED_QTRS + ][sid].iloc[next_qtr_start_idx] + for col in columns: + column_name = self.name_map[col.name] + # If there are estimates for the requested quarter, + # overwrite all values going up to the starting index of + # that quarter with estimates for that quarter. 
+ if requested_quarter in quarters_with_estimates_for_sid: + col_to_overwrites[column_name][next_qtr_start_idx] = \ + [self.create_overwrite_for_estimate( + col, + column_name, + last_per_qtr, + next_qtr_start_idx, + requested_quarter, + sid, + sid_idx + )] + # There are no estimates for the quarter. Overwrite all + # values going up to the starting index of that quarter + # with the missing value for this column. + else: + col_to_overwrites[column_name][next_qtr_start_idx] =\ + [self.overwrite_with_null( + col, + last_per_qtr.index, + next_qtr_start_idx, + sid_idx + )] + + def overwrite_with_null(self, + column, + dates, + next_qtr_start_idx, + sid_idx): + return self.scalar_overwrites_dict[column.dtype]( + 0, + next_qtr_start_idx - 1, + sid_idx, + sid_idx, + column.missing_value + ) + + def load_adjusted_array(self, columns, dates, assets, mask): + # Separate out getting the columns' datasets and the datasets' + # num_quarters attributes to ensure that we're catching the right + # AttributeError. + col_to_datasets = {col: col.dataset for col in columns} + try: + groups = groupby(lambda col: col_to_datasets[col].num_quarters, + col_to_datasets) + except AttributeError: + raise AttributeError("Datasets loaded via the " + "EarningsEstimatesLoader must define a " + "`num_quarters` attribute that defines how " + "many quarters out the loader should load " + "the data relative to `dates`.") + if any(num_qtr < 0 for num_qtr in groups): + raise ValueError( + INVALID_NUM_QTRS_MESSAGE % ','.join( + str(qtr) for qtr in groups if qtr < 0 + ) + + ) + out = {} + # To optimize performance, only work below on assets that are + # actually in the raw data. + assets_with_data = set(assets) & set(self.estimates[SID_FIELD_NAME]) + for num_quarters, columns in groups.items(): + last_per_qtr, stacked_last_per_qtr = self.get_last_data_per_qtr( + assets_with_data, columns, dates + ) + # Determine which quarter is immediately next/previous for each + # date. 
+ zeroth_quarter_idx = self.get_zeroth_quarter_idx( + num_quarters, stacked_last_per_qtr + ) + zero_qtr_data = stacked_last_per_qtr.loc[zeroth_quarter_idx] + # Doing it this way because creating a MultiIndex from scratch + # results in being unable to unstack sids because of duplicate + # values, even though the MultiIndex is created with the same + # exact values as below - possible pandas bug. + requested_qtr_idx = zero_qtr_data.reset_index( + NORMALIZED_QUARTERS + ).set_index( + self.get_shifted_qtrs( + zeroth_quarter_idx.get_level_values(NORMALIZED_QUARTERS), + num_quarters + ), + append=True + ).index + requested_qtr_idx = requested_qtr_idx.rename( + SHIFTED_NORMALIZED_QTRS, -1 + ) + requested_qtr_data = self.get_requested_quarter_data( + stacked_last_per_qtr, requested_qtr_idx, dates + ) + + # Calculate all adjustments for the given quarter and accumulate + # them for each column. + col_to_adjustments = self.get_adjustments(zero_qtr_data, + requested_qtr_data, + last_per_qtr, + dates, + assets_with_data, + columns) + for col in columns: + column_name = self.name_map[col.name] + # We may have dropped assets if they never have any data for + # the requested quarter. + df = pd.DataFrame(data=requested_qtr_data[column_name], + index=dates, + columns=assets, + dtype=col.dtype) + + out[col] = AdjustedArray( + df.values.astype(col.dtype), + mask, + dict(col_to_adjustments[column_name]), + col.missing_value, + ) + return out + + def get_last_data_per_qtr(self, assets_with_data, columns, dates): + """ + Determine the last piece of information we know for each column on each + date in the index for each sid and quarter. + + Parameters + ---------- + assets_with_data : pd.Index + Index of all assets that appear in the raw data given to the + loader. + columns : iterable of BoundColumn + The columns that need to be loaded from the raw data. + dates : pd.DatetimeIndex + The calendar of dates for which data should be loaded. 
+ + Returns + ------- + stacked_last_per_qtr : pd.DataFrame + A DataFrame indexed by [dates, sid, normalized_quarters] that has + the latest information for each row of the index, sorted by event + date. + last_per_qtr : pd.DataFrame + A DataFrame with columns that are a MultiIndex of [ + self.estimates.columns, normalized_quarters, sid]. + """ + # Get a DataFrame indexed by date with a MultiIndex of columns of [ + # self.estimates.columns, normalized_quarters, sid], where each cell + # contains the latest data for that day. + last_per_qtr = last_in_date_group( + self.estimates, dates, assets_with_data, reindex=True, + extra_groupers=[NORMALIZED_QUARTERS] + ) + # Forward fill values for each quarter/sid/dataset column. + ffill_across_cols(last_per_qtr, columns, self.name_map) + # Stack quarter and sid into the index. + stacked_last_per_qtr = last_per_qtr.stack([SID_FIELD_NAME, + NORMALIZED_QUARTERS]) + # Set date index name for ease of reference + stacked_last_per_qtr.index.set_names(SIMULTATION_DATES, + level=0, + inplace=True) + stacked_last_per_qtr = stacked_last_per_qtr.sort( + EVENT_DATE_FIELD_NAME + ) + stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] = pd.to_datetime( + stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] + ) + return last_per_qtr, stacked_last_per_qtr + + +class NextEarningsEstimatesLoader(EarningsEstimatesLoader): + @property + def searchsorted_side(self): + return 'right' + + def create_overwrite_for_estimate(self, + column, + column_name, + last_per_qtr, + next_qtr_start_idx, + requested_quarter, + sid, + sid_idx): + return self.array_overwrites_dict[column.dtype]( + 0, + # overwrite thru last qtr + next_qtr_start_idx - 1, + sid_idx, + sid_idx, + last_per_qtr[ + column_name, + requested_quarter, + sid + ][:next_qtr_start_idx].values) + + def get_shifted_qtrs(self, zero_qtrs, num_quarters): + return zero_qtrs + (num_quarters - 1) + + def get_zeroth_quarter_idx(self, num_quarters, stacked_last_per_qtr): + """ + Filters for releases that are on or after 
each simulation date and + determines the next quarter by picking out the upcoming release for + each date in the index. + + Parameters + ---------- + num_quarters : int + Number of quarters to go out in the future. + stacked_last_per_qtr : pd.DataFrame + A DataFrame with index of calendar dates, sid, and normalized + quarters with each row being the latest estimate for the row's + index values, sorted by event date. + + Returns + ------- + next_releases_per_date_index : pd.MultiIndex + An index of calendar dates, sid, and normalized quarters, for only + the rows that have a next event. + """ + + next_releases_per_date = stacked_last_per_qtr.loc[ + stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >= + stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) + ].groupby( + level=[SIMULTATION_DATES, SID_FIELD_NAME], as_index=False + ).nth(0) + return next_releases_per_date.index + + +class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): + @property + def searchsorted_side(self): + return 'left' + + def create_overwrite_for_estimate(self, + column, + column_name, + dates, + next_qtr_start_idx, + requested_quarter, + sid, + sid_idx): + return self.overwrite_with_null(column, + dates, + next_qtr_start_idx, + sid_idx) + + def get_shifted_qtrs(self, zero_qtrs, num_quarters): + return zero_qtrs - (num_quarters - 1) + + def get_zeroth_quarter_idx(self, num_quarters, stacked_last_per_qtr): + """ + Filters for releases that are on or after each simulation date and + determines the previous quarter by picking out the most recent + release relative to each date in the index. + + Parameters + ---------- + num_quarters : int + Number of quarters to go out in the past. + stacked_last_per_qtr : pd.DataFrame + A DataFrame with index of calendar dates, sid, and normalized + quarters with each row being the latest estimate for the row's + index values, sorted by event date. 
+ + Returns + ------- + previous_releases_per_date_index : pd.MultiIndex + An index of calendar dates, sid, and normalized quarters, for only + the rows that have a previous event. + """ + + previous_releases_per_date = stacked_last_per_qtr.loc[ + stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <= + stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) + ].groupby( + level=[SIMULTATION_DATES, SID_FIELD_NAME], as_index=False + ).nth(-1) + return previous_releases_per_date.index diff --git a/zipline/pipeline/loaders/quarter_estimates.py b/zipline/pipeline/loaders/quarter_estimates.py deleted file mode 100644 index 71ff3a32..00000000 --- a/zipline/pipeline/loaders/quarter_estimates.py +++ /dev/null @@ -1,529 +0,0 @@ -from collections import defaultdict -from abc import abstractmethod -import numpy as np -import pandas as pd -from six import viewvalues -from toolz import groupby - -from zipline.lib.adjusted_array import AdjustedArray -from zipline.lib.adjustment import (Datetime641DArrayOverwrite, - Float641DArrayOverwrite) - -from zipline.pipeline.common import ( - EVENT_DATE_FIELD_NAME, - FISCAL_QUARTER_FIELD_NAME, - FISCAL_YEAR_FIELD_NAME, - SID_FIELD_NAME, - TS_FIELD_NAME, -) -from zipline.pipeline.loaders.base import PipelineLoader -from zipline.utils.numpy_utils import datetime64ns_dtype -from zipline.pipeline.loaders.utils import ( - ffill_across_cols, - last_in_date_group -) - - -INVALID_NUM_QTRS_MESSAGE = "Passed invalid number of quarters %s; " \ - "must pass a number of quarters >= 0" -NEXT_FISCAL_QUARTER = 'next_fiscal_quarter' -NEXT_FISCAL_YEAR = 'next_fiscal_year' -NORMALIZED_QUARTERS = 'normalized_quarters' -PREVIOUS_FISCAL_QUARTER = 'previous_fiscal_quarter' -PREVIOUS_FISCAL_YEAR = 'previous_fiscal_year' -SHIFTED_NORMALIZED_QTRS = 'shifted_normalized_quarters' -SIMULTATION_DATES = 'dates' - - -def normalize_quarters(years, quarters): - return years * 4 + quarters - 1 - - -def split_normalized_quarters(normalized_quarters): - years = 
normalized_quarters // 4 - quarters = normalized_quarters % 4 - return years, quarters + 1 - - -def required_estimates_fields(columns): - """ - Compute the set of resource columns required to serve - `columns`. - """ - # These metadata columns are used to align event indexers. - return { - TS_FIELD_NAME, - SID_FIELD_NAME, - EVENT_DATE_FIELD_NAME, - FISCAL_QUARTER_FIELD_NAME, - FISCAL_YEAR_FIELD_NAME - }.union( - # We also expect any of the field names that our loadable columns - # are mapped to. - viewvalues(columns), - ) - - -def validate_column_specs(events, columns): - """ - Verify that the columns of ``events`` can be used by a - QuarterEstimatesLoader to serve the BoundColumns described by - `columns`. - """ - required = required_estimates_fields(columns) - received = set(events.columns) - missing = required - received - if missing: - raise ValueError( - "QuarterEstimatesLoader missing required columns {missing}.\n" - "Got Columns: {received}\n" - "Expected Columns: {required}".format( - missing=sorted(missing), - received=sorted(received), - required=sorted(required), - ) - ) - - -class QuarterEstimatesLoader(PipelineLoader): - def __init__(self, - estimates, - name_map): - validate_column_specs( - estimates, - name_map - ) - - self.estimates = estimates[ - estimates[EVENT_DATE_FIELD_NAME].notnull() & - estimates[FISCAL_QUARTER_FIELD_NAME].notnull() & - estimates[FISCAL_YEAR_FIELD_NAME].notnull() - ] - self.estimates[NORMALIZED_QUARTERS] = normalize_quarters( - self.estimates[FISCAL_YEAR_FIELD_NAME], - self.estimates[FISCAL_QUARTER_FIELD_NAME], - ) - - self.name_map = name_map - - @abstractmethod - def load_quarters(self, num_quarters, last, dates): - raise NotImplementedError('load_quarters') - - def get_requested_data_for_col(self, stacked_last_per_qtr, idx, dates): - """ - Selects the requested data for each date. 
- - Parameters - ---------- - stacked_last_per_qtr : pd.DataFrame - The latest estimate known with the dates, normalized quarter, and - sid as the index. - idx : pd.MultiIndex - The index of the row of the requested quarter from each date for - each sid. - dates : pd.DatetimeIndex - The calendar dates for which estimates data is requested. - - Returns - -------- - requested_qtr_data : pd.DataFrame - The DataFrame with the latest values for the requested quarter - for all columns; `dates` are the index and columns are a MultiIndex - with sids at the top level and the dataset columns on the bottom. - """ - requested_qtr_data = stacked_last_per_qtr.loc[idx] - # We no longer need the shifted normalized quarters in the index, but - # we do need it as a column to calculate adjustments. - requested_qtr_data = requested_qtr_data.reset_index( - SHIFTED_NORMALIZED_QTRS - ) - # Calculate the actual year/quarter being requested and add those in - # as columns. - (requested_qtr_data[FISCAL_YEAR_FIELD_NAME], - requested_qtr_data[FISCAL_QUARTER_FIELD_NAME]) = \ - split_normalized_quarters( - requested_qtr_data[SHIFTED_NORMALIZED_QTRS] - ) - # Move sids into the columns. Once we're left with just dates - # as the index, we can reindex by all dates so that we have a - # value for each calendar date. - requested_qtr_data = requested_qtr_data.unstack( - SID_FIELD_NAME - ).reindex(dates) - return requested_qtr_data - - def get_adjustments(self, - zero_qtr_data, - requested_qtr_data, - last_per_qtr, - dates, - assets, - columns): - """ - Creates an AdjustedArray from the given estimates data for the given - dates. - - Parameters - ---------- - zero_qtr_data : pd.DataFrame - The 'time zero' data for each date/sid. - zero_qtr_data : pd.DataFrame - The data for the requested quarter. 
- last_per_qtr : pd.DataFrame - The latest estimate known per sid per date per quarter with - dates as the index and normalized quarter and sid in the columns - MultiIndex; allows easy access to the timeline of estimates - across all dates for a sid for a particular quarter. - dates : pd.DatetimeIndex - The calendar dates for which estimates data is requested. - column_name : string - The name of the column for which the AdjustedArray is being - computed. - column : BoundColumn - The column for which the AdjustedArray is being computed. - mask : np.array - Mask array of dimensions len(dates) X len(assets). - assets : pd.Int64Index - An index of all the assets from the raw data. - - Returns - ------- - adjusted_array : AdjustedArray - The array of data and overwrites for the given column. - """ - col_to_adjustments = defaultdict(dict) - # We no longer need this in the index, but we do need it as a column - # to calculate adjustments. - zero_qtr_data = zero_qtr_data.reset_index(NORMALIZED_QUARTERS) - - for sid_idx, sid in enumerate(assets): - zero_qtr_sid_data = zero_qtr_data[ - zero_qtr_data.index.get_level_values(SID_FIELD_NAME) == sid - ] - # Determine where quarters are changing for this sid. - qtr_shifts = zero_qtr_sid_data[ - zero_qtr_sid_data[NORMALIZED_QUARTERS] != - zero_qtr_sid_data[NORMALIZED_QUARTERS].shift(1) - ] - # On dates where we don't have any information about quarters, - # we will get nulls, and each of these will be interpreted as - # quarter shifts. We need to remove these here. - qtr_shifts = qtr_shifts[ - qtr_shifts[NORMALIZED_QUARTERS].notnull() - ] - # For the given sid, determine which quarters we have estimates - # for. - qtrs_with_estimates_for_sid = last_per_qtr.xs( - sid, axis=1, level=SID_FIELD_NAME - ).groupby(axis=1, level=1).first().columns.values - for row_indexer in list(qtr_shifts.index): - # Find the starting index of the quarter that comes right - # after this row. 
This isn't the starting index of the - # requested quarter, but simply the date we cross over into a - # new quarter. - next_qtr_start_idx = dates.searchsorted( - zero_qtr_data.loc[ - row_indexer - ][EVENT_DATE_FIELD_NAME], - side='left' - if isinstance(self, PreviousQuartersEstimatesLoader) - else 'right' - ) - self.create_overwrite_for_quarter( - col_to_adjustments, - next_qtr_start_idx, - dates, - last_per_qtr, - qtrs_with_estimates_for_sid, - requested_qtr_data, - sid, - sid_idx, - columns, - ) - return col_to_adjustments - - def create_overwrite_for_quarter(self, - col_to_adjustments, - next_qtr_start_idx, - dates, - last_per_qtr, - quarters_with_estimates_for_sid, - requested_qtr_data, - sid, - sid_idx, - columns): - overwrites_dict = {} - for col in columns: - if col.dtype == datetime64ns_dtype: - overwrites_dict[col] = Datetime641DArrayOverwrite - else: - overwrites_dict[col] = Float641DArrayOverwrite - # Only add adjustments if the next quarter starts somewhere in - # our date index for this sid. Our 'next' quarter can never - # start at index 0; a starting index of 0 means that the next - # quarter's event date was NaT. - if 0 < next_qtr_start_idx < len(dates): - # Find the quarter being requested in the quarter we're - # crossing into. - requested_quarter = requested_qtr_data[ - SHIFTED_NORMALIZED_QTRS - ][sid].iloc[next_qtr_start_idx] - for col in columns: - column_name = self.name_map[col.name] - # If there are estimates for the requested quarter, - # overwrite all values going up to the starting index of - # that quarter with estimates for that quarter. - if requested_quarter in quarters_with_estimates_for_sid: - col_to_adjustments[column_name][next_qtr_start_idx] = \ - self.create_overwrite_for_estimate( - col, - column_name, - last_per_qtr, - next_qtr_start_idx, - overwrites_dict[col], - requested_quarter, - sid, - sid_idx - ) - # There are no estimates for the quarter. 
Overwrite all - # values going up to the starting index of that quarter - # with the missing value for this column. - else: - col_to_adjustments[column_name][next_qtr_start_idx] =\ - self.overwrite_with_null( - col, - last_per_qtr, - next_qtr_start_idx, - overwrites_dict[col], - sid_idx - ) - - def overwrite_with_null(self, - column, - last_per_qtr, - next_qtr_start_idx, - overwrite, - sid_idx): - return [overwrite( - 0, - next_qtr_start_idx - 1, - sid_idx, - sid_idx, - np.full( - len( - last_per_qtr.index[:next_qtr_start_idx] - ), - column.missing_value, - dtype=column.dtype - ))] - - def load_adjusted_array(self, columns, dates, assets, mask): - # Separate out getting the columns' datasets and the datasets' - # num_quarters attributes to ensure that we're catching the right - # AttributeError. - col_to_datasets = {col: col.dataset for col in columns} - try: - groups = groupby(lambda col: col_to_datasets[col].num_quarters, - col_to_datasets) - except AttributeError: - raise AttributeError("Datasets loaded via the " - "QuarterEstimatesLoader must define a " - "`num_quarters` attribute that defines how " - "many quarters out the loader should load " - "the data relative to `dates`.") - if any(num_qtr < 0 for num_qtr in groups): - raise ValueError( - INVALID_NUM_QTRS_MESSAGE % ','.join( - str(qtr) for qtr in groups if qtr < 0 - ) - - ) - out = {} - for num_quarters, columns in groups.items(): - # Determine the last piece of information we know for each column - # on each date in the index for each sid and quarter. - last_per_qtr = last_in_date_group( - self.estimates, dates, assets, reindex=True, - extra_groupers=[NORMALIZED_QUARTERS] - ) - - # Forward fill values for each quarter/sid/dataset column. - ffill_across_cols(last_per_qtr, columns, self.name_map) - # Stack quarter and sid into the index. 
- stacked_last_per_qtr = last_per_qtr.stack([SID_FIELD_NAME, - NORMALIZED_QUARTERS]) - # Set date index name for ease of reference - stacked_last_per_qtr.index.set_names(SIMULTATION_DATES, - level=0, - inplace=True) - # We want to know the most recent/next event relative to each date. - stacked_last_per_qtr = stacked_last_per_qtr.sort( - EVENT_DATE_FIELD_NAME - ) - # Determine which quarter is next/previous for each date. - shifted_qtr_data = self.load_quarters(num_quarters, - stacked_last_per_qtr) - zero_qtr_idx = shifted_qtr_data.index - requested_qtr_idx = shifted_qtr_data.set_index([ - shifted_qtr_data.index.get_level_values( - SIMULTATION_DATES - ), - shifted_qtr_data.index.get_level_values( - SID_FIELD_NAME - ), - shifted_qtr_data[SHIFTED_NORMALIZED_QTRS] - ]).index - requested_qtr_data = self.get_requested_data_for_col( - stacked_last_per_qtr, requested_qtr_idx, dates - ) - - zero_qtr_data = stacked_last_per_qtr.loc[zero_qtr_idx] - - col_to_adjustments = self.get_adjustments(zero_qtr_data, - requested_qtr_data, - last_per_qtr, - dates, - assets, - columns) - for col in columns: - column_name = self.name_map[col.name] - # We may have dropped assets if they never have any data for the - # requested quarter. 
- df = pd.DataFrame(data=requested_qtr_data[column_name], - index=dates, - columns=assets, - dtype=col.dtype) - - out[col] = AdjustedArray( - df.values.astype(col.dtype), - mask, - dict(col_to_adjustments[column_name]), - col.missing_value, - ) - return out - - -class NextQuartersEstimatesLoader(QuarterEstimatesLoader): - def create_overwrite_for_estimate(self, - column, - column_name, - last_per_qtr, - next_qtr_start_idx, - overwrite, - requested_quarter, - sid, - sid_idx): - return [overwrite( - 0, - # overwrite thru last qtr - next_qtr_start_idx - 1, - sid_idx, - sid_idx, - last_per_qtr[ - column_name, - requested_quarter, - sid - ][0:next_qtr_start_idx].values)] - - def load_quarters(self, num_quarters, stacked_last_per_qtr): - """ - Filters for releases that are on or after each simulation date and - determines the next quarter by picking out the upcoming release for - each date in the index. Adda a SHIFTED_NORMALIZED_QTRS column which - contains the requested next quarter for each calendar date and sid. - - Parameters - ---------- - num_quarters : int - Number of quarters to go out in the future. - stacked_last_per_qtr : pd.DataFrame - A DataFrame with index of calendar dates, sid, and normalized - quarters with each row being the latest estimate for the row's - index values, sorted by event date. - - Returns - ------- - next_releases_per_date : pd.DataFrame - A DataFrame with index of calendar dates, sid, and normalized - quarters, keeping only rows with next event information relative to - the index values and with an added column for - SHIFTED_NORMALIZED_QTRS, which contains the requested quarter for - each row. - """ - - # We reset the index here because in pandas3, a groupby on the index - # will set the index to just the items in the groupby, so we will lose - # the normalized quarters. 
- next_releases_per_date = stacked_last_per_qtr.loc[ - stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >= - stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) - ].reset_index(NORMALIZED_QUARTERS).groupby( - level=[SIMULTATION_DATES, SID_FIELD_NAME] - ).nth(0).set_index(NORMALIZED_QUARTERS, append=True) - next_releases_per_date[ - SHIFTED_NORMALIZED_QTRS - ] = next_releases_per_date.index.get_level_values( - NORMALIZED_QUARTERS - ) + (num_quarters - 1) - return next_releases_per_date - - -class PreviousQuartersEstimatesLoader(QuarterEstimatesLoader): - def create_overwrite_for_estimate(self, - column, - column_name, - last_per_qtr, - next_qtr_start_idx, - overwrite, - requested_quarter, - sid, - sid_idx): - return self.overwrite_with_null(column, - last_per_qtr, - next_qtr_start_idx, - overwrite, - sid_idx) - - def load_quarters(self, num_quarters, stacked_last_per_qtr): - """ - Filters for releases that are on or after each simulation date and - determines the previous quarter by picking out the most recent - release relative to each date in the index. Adds a - SHIFTED_NORMALIZED_QTRS column which contains the requested previous - quarter for each calendar date and sid. - - Parameters - ---------- - num_quarters : int - Number of quarters to go out in the past. - stacked_last_per_qtr : pd.DataFrame - A DataFrame with index of calendar dates, sid, and normalized - quarters with each row being the latest estimate for the row's - index values, sorted by event date. - - Returns - ------- - next_releases_per_date : pd.DataFrame - A DataFrame with index of calendar dates, sid, and normalized - quarters, keeping only rows with have a previous event relative - to the index values and with an added column for - SHIFTED_NORMALIZED_QTRS, which contains the requested quarter for - each row. - """ - - # We reset the index here because in pandas3, a groupby on the index - # will set the index to just the items in the groupby, so we will lose - # the normalized quarters. 
- previous_releases_per_date = stacked_last_per_qtr.loc[ - stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <= - stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) - ].reset_index(NORMALIZED_QUARTERS).groupby( - level=[SIMULTATION_DATES, SID_FIELD_NAME] - ).nth(-1).set_index(NORMALIZED_QUARTERS, append=True) - previous_releases_per_date[ - SHIFTED_NORMALIZED_QTRS - ] = previous_releases_per_date.index.get_level_values( - NORMALIZED_QUARTERS - ) - (num_quarters - 1) - return previous_releases_per_date diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index 125dacfb..a81df8fe 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -320,8 +320,11 @@ def last_in_date_group(df, dates, assets, reindex=True, have_sids=True, ).last() # For the number of things that we're grouping by (except TS), unstack - # the df - last_in_group = last_in_group.unstack(list(range(-1, -len(idx), -1))) + # the df. Done this way because of an unresolved pandas bug whereby + # passing a list of levels with mixed dtypes to unstack causes the + # resulting DataFrame to have all object-type columns. 
+ for _ in range(len(idx) - 1): + last_in_group = last_in_group.unstack(-1) if reindex: if have_sids: diff --git a/zipline/testing/fixtures.py b/zipline/testing/fixtures.py index 1ce01627..f0e2aaa5 100644 --- a/zipline/testing/fixtures.py +++ b/zipline/testing/fixtures.py @@ -34,14 +34,13 @@ from ..finance.trading import TradingEnvironment from ..utils import factory from ..utils.classproperty import classproperty from ..utils.final import FinalMeta, final -from .core import (tmp_asset_finder, make_simple_equity_info) +from .core import tmp_asset_finder, make_simple_equity_info from zipline.assets import Equity, Future from zipline.pipeline import SimplePipelineEngine from zipline.pipeline.loaders.testing import make_seeded_random_loader from zipline.utils.calendars import ( get_calendar, - register_calendar -) + register_calendar) class ZiplineTestCase(with_metaclass(FinalMeta, TestCase)): From bdce6ae4935bac02d0dd4e125c3677134621d4b4 Mon Sep 17 00:00:00 2001 From: Joe Jevnik Date: Mon, 19 Sep 2016 21:13:28 -0400 Subject: [PATCH 14/16] PERF: vectorize earnings estimates --- .../pipeline/loaders/earnings_estimates.py | 275 ++++++++++-------- zipline/pipeline/loaders/utils.py | 6 +- 2 files changed, 156 insertions(+), 125 deletions(-) diff --git a/zipline/pipeline/loaders/earnings_estimates.py b/zipline/pipeline/loaders/earnings_estimates.py index 3397d105..6790e2c4 100644 --- a/zipline/pipeline/loaders/earnings_estimates.py +++ b/zipline/pipeline/loaders/earnings_estimates.py @@ -1,5 +1,7 @@ -from collections import defaultdict from abc import abstractmethod, abstractproperty +from collections import defaultdict + +import numpy as np import pandas as pd from six import viewvalues from toolz import groupby @@ -136,11 +138,15 @@ class EarningsEstimatesLoader(PipelineLoader): self.estimates[FISCAL_YEAR_FIELD_NAME], self.estimates[FISCAL_QUARTER_FIELD_NAME], ) - self.array_overwrites_dict = {datetime64ns_dtype: - Datetime641DArrayOverwrite, - float64_dtype: 
Float641DArrayOverwrite} - self.scalar_overwrites_dict = {datetime64ns_dtype: Datetime64Overwrite, - float64_dtype: Float64Overwrite} + + self.array_overwrites_dict = { + datetime64ns_dtype: Datetime641DArrayOverwrite, + float64_dtype: Float641DArrayOverwrite, + } + self.scalar_overwrites_dict = { + datetime64ns_dtype: Datetime64Overwrite, + float64_dtype: Float64Overwrite, + } self.name_map = name_map @@ -167,18 +173,25 @@ class EarningsEstimatesLoader(PipelineLoader): def searchsorted_side(self): return NotImplementedError('searchsorted_side') - def get_requested_quarter_data(self, stacked_last_per_qtr, idx, dates): + def get_requested_quarter_data(self, + zero_qtr_data, + zeroth_quarter_idx, + stacked_last_per_qtr, + num_quarters, + dates): """ Selects the requested data for each date. Parameters ---------- + zero_qtr_data : pd.DataFrame + The 'time zero' data for each calendar date per sid. + zeroth_quarter_idx : pd.Index + An index of calendar dates, sid, and normalized quarters, for only + the rows that have a next or previous earnings estimate. stacked_last_per_qtr : pd.DataFrame - The latest estimate known with the dates, normalized quarter, and + The latest estimate known with the dates, normalized quarter, and sid as the index. - idx : pd.MultiIndex - The index of the row of the requested quarter from each date for - each sid. dates : pd.DatetimeIndex The calendar dates for which estimates data is requested. @@ -189,13 +202,27 @@ class EarningsEstimatesLoader(PipelineLoader): for all columns; `dates` are the index and columns are a MultiIndex with sids at the top level and the dataset columns on the bottom. """ - requested_qtr_data = stacked_last_per_qtr.loc[idx] - # We've lost the index names when doing `loc`, so set them here. 
- requested_qtr_data.index = requested_qtr_data.index.set_names( - idx.names + zero_qtr_data_idx = zero_qtr_data.index + requested_qtr_idx = pd.MultiIndex.from_arrays( + [ + zero_qtr_data_idx.get_level_values(0), + zero_qtr_data_idx.get_level_values(1), + self.get_shifted_qtrs( + zeroth_quarter_idx.get_level_values( + NORMALIZED_QUARTERS, + ), + num_quarters, + ), + ], + names=[ + zero_qtr_data_idx.names[0], + zero_qtr_data_idx.names[1], + SHIFTED_NORMALIZED_QTRS, + ], ) + requested_qtr_data = stacked_last_per_qtr.loc[requested_qtr_idx] requested_qtr_data = requested_qtr_data.reset_index( - SHIFTED_NORMALIZED_QTRS + SHIFTED_NORMALIZED_QTRS, ) # Calculate the actual year/quarter being requested and add those in # as columns. @@ -244,54 +271,44 @@ class EarningsEstimatesLoader(PipelineLoader): col_to_overwrites = defaultdict(dict) # We no longer need NORMALIZED_QUARTERS in the index, but we do need it # as a column to calculate adjustments. - zero_qtr_data = zero_qtr_data.reset_index(NORMALIZED_QUARTERS) + zero_qtr_data = zero_qtr_data.reset_index(level=NORMALIZED_QUARTERS) + zero_qtr_data.sort_index(inplace=True) - for sid_idx, sid in enumerate(assets): - zero_qtr_sid_data = zero_qtr_data[ - zero_qtr_data.index.get_level_values(SID_FIELD_NAME) == sid + quarter_shifts = zero_qtr_data.loc[ + zero_qtr_data.index[ + zero_qtr_data.groupby(level=SID_FIELD_NAME)[ + NORMALIZED_QUARTERS + ].diff().nonzero() ] - # Determine where quarters are changing for this sid. - qtr_shifts = zero_qtr_sid_data[ - zero_qtr_sid_data[NORMALIZED_QUARTERS] != - zero_qtr_sid_data[NORMALIZED_QUARTERS].shift(1) - ] - # On dates where we don't have any information about quarters, - # we will get nulls, and each of these will be interpreted as - # quarter shifts. We need to remove these here. - qtr_shifts = qtr_shifts[ - qtr_shifts[NORMALIZED_QUARTERS].notnull() - ] - # For the given sid, determine which quarters we have estimates - # for. 
- qtrs_with_estimates_for_sid = last_per_qtr.xs( - sid, axis=1, level=SID_FIELD_NAME - ).groupby(axis=1, level=1).first().columns.values - for row_indexer in list(qtr_shifts.index): - # Find the starting index of the quarter that comes right - # after this row. This isn't the starting index of the - # requested quarter, but simply the date we cross over into a - # new quarter. - next_qtr_start_idx = dates.searchsorted( - zero_qtr_data.loc[ - row_indexer - ][EVENT_DATE_FIELD_NAME], - side=self.searchsorted_side - ) - # Only add adjustments if the next quarter starts somewhere in - # our date index for this sid. Our 'next' quarter can never - # start at index 0; a starting index of 0 means that the next - # quarter's event date was NaT. - if 0 < next_qtr_start_idx < len(dates): + ] + + sid_to_idx = dict(zip(assets, range(len(assets)))) + + def collect_adjustments(group): + next_qtr_start_indices = dates.searchsorted( + group[EVENT_DATE_FIELD_NAME].values, + side=self.searchsorted_side, + ) + sid = int(group.name) + qtrs_with_estimates = group[NORMALIZED_QUARTERS].values + for idx in next_qtr_start_indices: + if 0 < idx < len(dates): + # Only add adjustments if the next quarter starts somewhere + # in our date index for this sid. Our 'next' quarter can + # never start at index 0; a starting index of 0 means that + # the next quarter's event date was NaT. self.create_overwrite_for_quarter( col_to_overwrites, - next_qtr_start_idx, + idx, last_per_qtr, - qtrs_with_estimates_for_sid, + qtrs_with_estimates, requested_qtr_data, sid, - sid_idx, + sid_to_idx[sid], columns, ) + + quarter_shifts.groupby(level=SID_FIELD_NAME).apply(collect_adjustments) return col_to_overwrites def create_overwrite_for_quarter(self, @@ -335,16 +352,16 @@ class EarningsEstimatesLoader(PipelineLoader): # Find the quarter being requested in the quarter we're # crossing into. 
requested_quarter = requested_qtr_data[ - SHIFTED_NORMALIZED_QTRS - ][sid].iloc[next_qtr_start_idx] + SHIFTED_NORMALIZED_QTRS, sid, + ].iloc[next_qtr_start_idx] for col in columns: column_name = self.name_map[col.name] # If there are estimates for the requested quarter, # overwrite all values going up to the starting index of # that quarter with estimates for that quarter. if requested_quarter in quarters_with_estimates_for_sid: - col_to_overwrites[column_name][next_qtr_start_idx] = \ - [self.create_overwrite_for_estimate( + col_to_overwrites[column_name][next_qtr_start_idx] = [ + self.create_overwrite_for_estimate( col, column_name, last_per_qtr, @@ -352,18 +369,20 @@ class EarningsEstimatesLoader(PipelineLoader): requested_quarter, sid, sid_idx - )] + ), + ] # There are no estimates for the quarter. Overwrite all # values going up to the starting index of that quarter # with the missing value for this column. else: - col_to_overwrites[column_name][next_qtr_start_idx] =\ - [self.overwrite_with_null( + col_to_overwrites[column_name][next_qtr_start_idx] = [ + self.overwrite_with_null( col, last_per_qtr.index, next_qtr_start_idx, sid_idx - )] + ), + ] def overwrite_with_null(self, column, @@ -403,34 +422,23 @@ class EarningsEstimatesLoader(PipelineLoader): # To optimize performance, only work below on assets that are # actually in the raw data. assets_with_data = set(assets) & set(self.estimates[SID_FIELD_NAME]) + last_per_qtr, stacked_last_per_qtr = self.get_last_data_per_qtr( + assets_with_data, + columns, + dates + ) + # Determine which quarter is immediately next/previous for each + # date. + zeroth_quarter_idx = self.get_zeroth_quarter_idx(stacked_last_per_qtr) + zero_qtr_data = stacked_last_per_qtr.loc[zeroth_quarter_idx] + for num_quarters, columns in groups.items(): - last_per_qtr, stacked_last_per_qtr = self.get_last_data_per_qtr( - assets_with_data, columns, dates - ) - # Determine which quarter is immediately next/previous for each - # date. 
- zeroth_quarter_idx = self.get_zeroth_quarter_idx( - num_quarters, stacked_last_per_qtr - ) - zero_qtr_data = stacked_last_per_qtr.loc[zeroth_quarter_idx] - # Doing it this way because creating a MultiIndex from scratch - # results in being unable to unstack sids because of duplicate - # values, even though the MultiIndex is created with the same - # exact values as below - possible pandas bug. - requested_qtr_idx = zero_qtr_data.reset_index( - NORMALIZED_QUARTERS - ).set_index( - self.get_shifted_qtrs( - zeroth_quarter_idx.get_level_values(NORMALIZED_QUARTERS), - num_quarters - ), - append=True - ).index - requested_qtr_idx = requested_qtr_idx.rename( - SHIFTED_NORMALIZED_QTRS, -1 - ) requested_qtr_data = self.get_requested_quarter_data( - stacked_last_per_qtr, requested_qtr_idx, dates + zero_qtr_data, + zeroth_quarter_idx, + stacked_last_per_qtr, + num_quarters, + dates, ) # Calculate all adjustments for the given quarter and accumulate @@ -441,17 +449,33 @@ class EarningsEstimatesLoader(PipelineLoader): dates, assets_with_data, columns) + + # Lookup the asset indexer once, this is so we can reindex + # the assets returned into the assets requested for each column. + # This depends on the fact that our column multiindex has the same + # sids for each field. This allows us to do the lookup once on + # level 1 instead of doing the lookup each time per value in + # level 0. + asset_indexer = assets.get_indexer_for( + requested_qtr_data.columns.levels[1], + ) for col in columns: column_name = self.name_map[col.name] - # We may have dropped assets if they never have any data for - # the requested quarter. 
- df = pd.DataFrame(data=requested_qtr_data[column_name], - index=dates, - columns=assets, - dtype=col.dtype) + # allocate the empty output with the correct missing value + output_array = np.full( + (len(dates), len(assets)), + col.missing_value, + dtype=col.dtype, + ) + # overwrite the missing value with values from the computed + # data + output_array[ + :, + asset_indexer, + ] = requested_qtr_data[column_name].values out[col] = AdjustedArray( - df.values.astype(col.dtype), + output_array, mask, dict(col_to_adjustments[column_name]), col.missing_value, @@ -487,20 +511,26 @@ class EarningsEstimatesLoader(PipelineLoader): # self.estimates.columns, normalized_quarters, sid], where each cell # contains the latest data for that day. last_per_qtr = last_in_date_group( - self.estimates, dates, assets_with_data, reindex=True, - extra_groupers=[NORMALIZED_QUARTERS] + self.estimates, + dates, + assets_with_data, + reindex=True, + extra_groupers=[NORMALIZED_QUARTERS], ) # Forward fill values for each quarter/sid/dataset column. ffill_across_cols(last_per_qtr, columns, self.name_map) # Stack quarter and sid into the index. 
- stacked_last_per_qtr = last_per_qtr.stack([SID_FIELD_NAME, - NORMALIZED_QUARTERS]) + stacked_last_per_qtr = last_per_qtr.stack( + [SID_FIELD_NAME, NORMALIZED_QUARTERS], + ) # Set date index name for ease of reference - stacked_last_per_qtr.index.set_names(SIMULTATION_DATES, - level=0, - inplace=True) + stacked_last_per_qtr.index.set_names( + SIMULTATION_DATES, + level=0, + inplace=True, + ) stacked_last_per_qtr = stacked_last_per_qtr.sort( - EVENT_DATE_FIELD_NAME + EVENT_DATE_FIELD_NAME, ) stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] = pd.to_datetime( stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] @@ -509,9 +539,7 @@ class EarningsEstimatesLoader(PipelineLoader): class NextEarningsEstimatesLoader(EarningsEstimatesLoader): - @property - def searchsorted_side(self): - return 'right' + searchsorted_side = 'right' def create_overwrite_for_estimate(self, column, @@ -530,13 +558,14 @@ class NextEarningsEstimatesLoader(EarningsEstimatesLoader): last_per_qtr[ column_name, requested_quarter, - sid - ][:next_qtr_start_idx].values) + sid, + ].values[:next_qtr_start_idx], + ) def get_shifted_qtrs(self, zero_qtrs, num_quarters): return zero_qtrs + (num_quarters - 1) - def get_zeroth_quarter_idx(self, num_quarters, stacked_last_per_qtr): + def get_zeroth_quarter_idx(self, stacked_last_per_qtr): """ Filters for releases that are on or after each simulation date and determines the next quarter by picking out the upcoming release for @@ -544,8 +573,6 @@ class NextEarningsEstimatesLoader(EarningsEstimatesLoader): Parameters ---------- - num_quarters : int - Number of quarters to go out in the future. 
stacked_last_per_qtr : pd.DataFrame A DataFrame with index of calendar dates, sid, and normalized quarters with each row being the latest estimate for the row's @@ -562,15 +589,14 @@ class NextEarningsEstimatesLoader(EarningsEstimatesLoader): stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >= stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) ].groupby( - level=[SIMULTATION_DATES, SID_FIELD_NAME], as_index=False + level=[SIMULTATION_DATES, SID_FIELD_NAME], + as_index=False, ).nth(0) return next_releases_per_date.index class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): - @property - def searchsorted_side(self): - return 'left' + searchsorted_side = 'left' def create_overwrite_for_estimate(self, column, @@ -580,15 +606,17 @@ class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): requested_quarter, sid, sid_idx): - return self.overwrite_with_null(column, - dates, - next_qtr_start_idx, - sid_idx) + return self.overwrite_with_null( + column, + dates, + next_qtr_start_idx, + sid_idx, + ) def get_shifted_qtrs(self, zero_qtrs, num_quarters): return zero_qtrs - (num_quarters - 1) - def get_zeroth_quarter_idx(self, num_quarters, stacked_last_per_qtr): + def get_zeroth_quarter_idx(self, stacked_last_per_qtr): """ Filters for releases that are on or after each simulation date and determines the previous quarter by picking out the most recent @@ -596,8 +624,6 @@ class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): Parameters ---------- - num_quarters : int - Number of quarters to go out in the past. 
stacked_last_per_qtr : pd.DataFrame A DataFrame with index of calendar dates, sid, and normalized quarters with each row being the latest estimate for the row's @@ -614,6 +640,7 @@ class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <= stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) ].groupby( - level=[SIMULTATION_DATES, SID_FIELD_NAME], as_index=False + level=[SIMULTATION_DATES, SID_FIELD_NAME], + as_index=False, ).nth(-1) return previous_releases_per_date.index diff --git a/zipline/pipeline/loaders/utils.py b/zipline/pipeline/loaders/utils.py index a81df8fe..028da74c 100644 --- a/zipline/pipeline/loaders/utils.py +++ b/zipline/pipeline/loaders/utils.py @@ -276,7 +276,11 @@ def check_data_query_args(data_query_time, data_query_tz): ) -def last_in_date_group(df, dates, assets, reindex=True, have_sids=True, +def last_in_date_group(df, + dates, + assets, + reindex=True, + have_sids=True, extra_groupers=[]): """ Determine the last piece of information known on each date in the date From 8d2e6446ee0886766e5bb861ca2ddc5112c17bd8 Mon Sep 17 00:00:00 2001 From: Joe Jevnik Date: Tue, 20 Sep 2016 13:44:26 -0400 Subject: [PATCH 15/16] PERF: only query for the columns requested + metadata BUG: choose last event date for quarter shift --- tests/pipeline/test_quarters_estimates.py | 8 +-- zipline/pipeline/loaders/blaze/estimates.py | 20 ++++--- zipline/pipeline/loaders/blaze/events.py | 2 +- .../pipeline/loaders/earnings_estimates.py | 52 +++++++++---------- 4 files changed, 43 insertions(+), 39 deletions(-) diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 4f7b8366..8301201d 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -746,7 +746,7 @@ class WithEstimateWindows(WithEstimates): columns=[SID_FIELD_NAME, 'estimate', 'knowledge_date']) - df = df.pivot_table(columns='sid', + df = 
df.pivot_table(columns=SID_FIELD_NAME, values='estimate', index='knowledge_date') df = df.reindex( @@ -796,8 +796,8 @@ class WithEstimateWindows(WithEstimates): engine.run_pipeline( Pipeline({'est': SomeFactor()}), start_date=start_idx, - end_date=pd.Timestamp('2015-01-20', tz='utc'), # last event date - # we have + # last event date we have + end_date=pd.Timestamp('2015-01-20', tz='utc'), ) @@ -938,7 +938,7 @@ class QuarterShiftTestCase(ZiplineTestCase): backwards/forwards from a starting point. """ def test_quarter_normalization(self): - input_yrs = pd.Series([0] * 4, dtype=np.int64) + input_yrs = pd.Series(range(2011, 2015), dtype=np.int64) input_qtrs = pd.Series(range(1, 5), dtype=np.int64) result_years, result_quarters = split_normalized_quarters( normalize_quarters(input_yrs, input_qtrs) diff --git a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py index b63c58a6..58a2fba0 100644 --- a/zipline/pipeline/loaders/blaze/estimates.py +++ b/zipline/pipeline/loaders/blaze/estimates.py @@ -16,6 +16,7 @@ from zipline.pipeline.loaders.earnings_estimates import ( NextEarningsEstimatesLoader, PreviousEarningsEstimatesLoader, required_estimates_fields, + metadata_columns, ) from zipline.pipeline.loaders.utils import ( check_data_query_args, @@ -104,17 +105,20 @@ class BlazeEstimatesLoader(PipelineLoader): self._checkpoints = checkpoints def load_adjusted_array(self, columns, dates, assets, mask): - raw = load_raw_data(assets, - dates, - self._data_query_time, - self._data_query_tz, - self._expr, - self._odo_kwargs, - checkpoints=self._checkpoints) + column_names = [column.name for column in columns] + raw = load_raw_data( + assets, + dates, + self._data_query_time, + self._data_query_tz, + self._expr[sorted(metadata_columns.union(column_names))], + self._odo_kwargs, + checkpoints=self._checkpoints, + ) return self.loader( raw, - self._columns, + {k: self._columns[k] for k in column_names} ).load_adjusted_array( columns, dates, diff 
--git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 6ecf3c6f..0a6e56f3 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -57,7 +57,7 @@ class BlazeEventsLoader(PipelineLoader): start the backtest with knowledge of all announcements. """ - __doc__ == __doc__.format(SID_FIELD_NAME=SID_FIELD_NAME, + __doc__ = __doc__.format(SID_FIELD_NAME=SID_FIELD_NAME, TS_FIELD_NAME=TS_FIELD_NAME, EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME) diff --git a/zipline/pipeline/loaders/earnings_estimates.py b/zipline/pipeline/loaders/earnings_estimates.py index 6790e2c4..c7a141a8 100644 --- a/zipline/pipeline/loaders/earnings_estimates.py +++ b/zipline/pipeline/loaders/earnings_estimates.py @@ -50,23 +50,24 @@ def split_normalized_quarters(normalized_quarters): return years, quarters + 1 +# These metadata columns are used to align event indexers. +metadata_columns = frozenset({ + TS_FIELD_NAME, + SID_FIELD_NAME, + EVENT_DATE_FIELD_NAME, + FISCAL_QUARTER_FIELD_NAME, + FISCAL_YEAR_FIELD_NAME, +}) + + def required_estimates_fields(columns): """ Compute the set of resource columns required to serve `columns`. """ - # These metadata columns are used to align event indexers. - return { - TS_FIELD_NAME, - SID_FIELD_NAME, - EVENT_DATE_FIELD_NAME, - FISCAL_QUARTER_FIELD_NAME, - FISCAL_YEAR_FIELD_NAME - }.union( - # We also expect any of the field names that our loadable columns - # are mapped to. - viewvalues(columns), - ) + # We also expect any of the field names that our loadable columns + # are mapped to. + return metadata_columns.union(viewvalues(columns)) def validate_column_specs(events, columns): @@ -269,18 +270,13 @@ class EarningsEstimatesLoader(PipelineLoader): The array of data and overwrites for the given column. """ col_to_overwrites = defaultdict(dict) - # We no longer need NORMALIZED_QUARTERS in the index, but we do need it - # as a column to calculate adjustments. 
- zero_qtr_data = zero_qtr_data.reset_index(level=NORMALIZED_QUARTERS) zero_qtr_data.sort_index(inplace=True) - - quarter_shifts = zero_qtr_data.loc[ - zero_qtr_data.index[ - zero_qtr_data.groupby(level=SID_FIELD_NAME)[ - NORMALIZED_QUARTERS - ].diff().nonzero() - ] - ] + # Here we want to get the LAST record from each group of records + # corresponding to a single quarter. This is to ensure that we select + # the most up-to-date event date in case the event date changes. + quarter_shifts = zero_qtr_data.groupby( + level=[SID_FIELD_NAME, NORMALIZED_QUARTERS] + ).nth(-1) sid_to_idx = dict(zip(assets, range(len(assets)))) @@ -290,7 +286,9 @@ class EarningsEstimatesLoader(PipelineLoader): side=self.searchsorted_side, ) sid = int(group.name) - qtrs_with_estimates = group[NORMALIZED_QUARTERS].values + qtrs_with_estimates = group.index.get_level_values( + NORMALIZED_QUARTERS + ).values for idx in next_qtr_start_indices: if 0 < idx < len(dates): # Only add adjustments if the next quarter starts somewhere @@ -584,13 +582,14 @@ class NextEarningsEstimatesLoader(EarningsEstimatesLoader): An index of calendar dates, sid, and normalized quarters, for only the rows that have a next event. """ - next_releases_per_date = stacked_last_per_qtr.loc[ stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >= stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) ].groupby( level=[SIMULTATION_DATES, SID_FIELD_NAME], as_index=False, + # Here we take advantage of the fact that `stacked_last_per_qtr` is + # sorted by event date. ).nth(0) return next_releases_per_date.index @@ -635,12 +634,13 @@ class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): An index of calendar dates, sid, and normalized quarters, for only the rows that have a previous event. 
""" - previous_releases_per_date = stacked_last_per_qtr.loc[ stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <= stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) ].groupby( level=[SIMULTATION_DATES, SID_FIELD_NAME], as_index=False, + # Here we take advantage of the fact that `stacked_last_per_qtr` is + # sorted by event date. ).nth(-1) return previous_releases_per_date.index From f528c01ca9ac396135f7a89e58e3c046de1f352d Mon Sep 17 00:00:00 2001 From: Maya Tydykov Date: Wed, 21 Sep 2016 15:35:21 -0400 Subject: [PATCH 16/16] TST: add test for changing event dates and adjustments BUG: get column names from column dict BUG: fix name map --- tests/pipeline/test_quarters_estimates.py | 136 ++++++++++++++++-- zipline/pipeline/loaders/blaze/core.py | 2 +- zipline/pipeline/loaders/blaze/estimates.py | 8 +- zipline/pipeline/loaders/blaze/events.py | 4 +- .../pipeline/loaders/earnings_estimates.py | 45 +++--- zipline/testing/core.py | 6 - 6 files changed, 156 insertions(+), 45 deletions(-) diff --git a/tests/pipeline/test_quarters_estimates.py b/tests/pipeline/test_quarters_estimates.py index 8301201d..8f0cf777 100644 --- a/tests/pipeline/test_quarters_estimates.py +++ b/tests/pipeline/test_quarters_estimates.py @@ -3,6 +3,7 @@ import itertools from nose.tools import assert_true from nose_parameterized import parameterized import numpy as np +from numpy.testing import assert_array_equal import pandas as pd from toolz import merge @@ -44,9 +45,9 @@ class Estimates(DataSet): estimate = Column(dtype=float64_dtype) -def QuartersEstimates(num_qtr): +def QuartersEstimates(announcements_out): class QtrEstimates(Estimates): - num_quarters = num_qtr + num_announcements = announcements_out name = Estimates return QtrEstimates @@ -123,11 +124,11 @@ class WithWrongLoaderDefinition(WithEstimates): Tests ------ - test_wrong_num_quarters_passed() + test_wrong_num_announcements_passed() Tests that loading with an incorrect quarter number raises an error. 
- test_no_num_quarters_attr() + test_no_num_announcements_attr() Tests that the loader throws an AssertionError if the dataset being - loaded has no `num_quarters` attribute. + loaded has no `num_announcements` attribute. """ @classmethod @@ -141,7 +142,7 @@ class WithWrongLoaderDefinition(WithEstimates): 'estimate'], index=[0]) - def test_wrong_num_quarters_passed(self): + def test_wrong_num_announcements_passed(self): bad_dataset1 = QuartersEstimates(-1) bad_dataset2 = QuartersEstimates(-2) good_dataset = QuartersEstimates(1) @@ -150,7 +151,7 @@ class WithWrongLoaderDefinition(WithEstimates): self.trading_days, self.asset_finder, ) - columns = {c.name + str(dataset.num_quarters): c.latest + columns = {c.name + str(dataset.num_announcements): c.latest for dataset in (bad_dataset1, bad_dataset2, good_dataset) @@ -165,7 +166,7 @@ class WithWrongLoaderDefinition(WithEstimates): ) assert_raises_regex(e, INVALID_NUM_QTRS_MESSAGE % "-1,-2") - def test_no_num_quarters_attr(self): + def test_no_num_announcements_attr(self): dataset = QuartersEstimatesNoNumQuartersAttr(1) engine = SimplePipelineEngine( lambda x: self.loader, @@ -657,6 +658,119 @@ class PreviousEstimateMultipleQuarters( return expected +class WithVaryingNumEstimates(WithEstimates): + """ + ZiplineTestCase mixin providing fixtures and a test to ensure that we + have the correct overwrites when the event date changes. We want to make + sure that if we have a quarter with an event date that gets pushed back, + we don't start overwriting for the next quarter early. Likewise, + if we have a quarter with an event date that gets pushed forward, we want + to make sure that we start applying adjustments at the appropriate, earlier + date, rather than the later date. + + Methods + ------- + assert_compute() + Defines how to determine that results computed for the `SomeFactor` + factor are correct. 
+ + Tests + ----- + test_windows_with_varying_num_estimates() + Tests that we create the correct overwrites from 2015-01-13 to + 2015-01-14 regardless of how event dates were updated for each + quarter for each sid. + """ + + @classmethod + def make_events(cls): + return pd.DataFrame({ + SID_FIELD_NAME: [0] * 3 + [1] * 3, + TS_FIELD_NAME: [pd.Timestamp('2015-01-09'), + pd.Timestamp('2015-01-12'), + pd.Timestamp('2015-01-13')] * 2, + EVENT_DATE_FIELD_NAME: [pd.Timestamp('2015-01-12'), + pd.Timestamp('2015-01-13'), + pd.Timestamp('2015-01-20'), + pd.Timestamp('2015-01-13'), + pd.Timestamp('2015-01-12'), + pd.Timestamp('2015-01-20')], + 'estimate': [11., 12., 21.] * 2, + FISCAL_QUARTER_FIELD_NAME: [1, 1, 2] * 2, + FISCAL_YEAR_FIELD_NAME: [2015] * 6 + }) + + @classmethod + def assert_compute(cls, estimate, today): + raise NotImplementedError('assert_compute') + + def test_windows_with_varying_num_estimates(self): + dataset = QuartersEstimates(1) + assert_compute = self.assert_compute + + class SomeFactor(CustomFactor): + inputs = [dataset.estimate] + window_length = 3 + + def compute(self, today, assets, out, estimate): + assert_compute(estimate, today) + + engine = SimplePipelineEngine( + lambda x: self.loader, + self.trading_days, + self.asset_finder, + ) + engine.run_pipeline( + Pipeline({'est': SomeFactor()}), + start_date=pd.Timestamp('2015-01-13', tz='utc'), + # last event date we have + end_date=pd.Timestamp('2015-01-14', tz='utc'), + ) + + +class PreviousVaryingNumEstimates( + WithVaryingNumEstimates, + ZiplineTestCase +): + def assert_compute(self, estimate, today): + if today == pd.Timestamp('2015-01-13', tz='utc'): + assert_array_equal(estimate[:, 0], + np.array([np.NaN, np.NaN, 12])) + assert_array_equal(estimate[:, 1], + np.array([np.NaN, 12, 12])) + else: + assert_array_equal(estimate[:, 0], + np.array([np.NaN, 12, 12])) + assert_array_equal(estimate[:, 1], + np.array([12, 12, 12])) + + @classmethod + def make_loader(cls, events, columns): + return 
PreviousEarningsEstimatesLoader(events, columns) + + +class NextVaryingNumEstimates( + WithVaryingNumEstimates, + ZiplineTestCase +): + + def assert_compute(self, estimate, today): + if today == pd.Timestamp('2015-01-13', tz='utc'): + assert_array_equal(estimate[:, 0], + np.array([11, 12, 12])) + assert_array_equal(estimate[:, 1], + np.array([np.NaN, np.NaN, 21])) + else: + assert_array_equal(estimate[:, 0], + np.array([np.NaN, 21, 21])) + assert_array_equal(estimate[:, 1], + np.array([np.NaN, 21, 21])) + + @classmethod + def make_loader(cls, events, columns): + return NextEarningsEstimatesLoader(events, columns) + + class WithEstimateWindows(WithEstimates): """ ZiplineTestCase mixin providing fixures and a test to test running a @@ -761,8 +875,8 @@ class WithEstimateWindows(WithEstimates): @parameterized.expand(window_test_cases) def test_estimate_windows_at_quarter_boundaries(self, start_idx, - num_quarters_out): - dataset = QuartersEstimates(num_quarters_out) + num_announcements_out): + dataset = QuartersEstimates(num_announcements_out) trading_days = self.trading_days timelines = self.timelines # The window length should be from the starting index back to the first @@ -781,7 +895,7 @@ class WithEstimateWindows(WithEstimates): def compute(self, today, assets, out, estimate): today_idx = trading_days.get_loc(today) today_timeline = timelines[ - num_quarters_out + num_announcements_out ].loc[today].reindex( trading_days[:today_idx + 1] ).values diff --git a/zipline/pipeline/loaders/blaze/core.py b/zipline/pipeline/loaders/blaze/core.py index d5c3e40b..925faddf 100644 --- a/zipline/pipeline/loaders/blaze/core.py +++ b/zipline/pipeline/loaders/blaze/core.py @@ -188,7 +188,7 @@ from zipline.utils.input_validation import ( ensure_timezone, optionally, ) -from zipline.utils.numpy_utils import bool_dtype, categorical_dtype +from zipline.utils.numpy_utils import bool_dtype from zipline.utils.pool import SequentialPool from zipline.utils.preprocess import preprocess diff 
--git a/zipline/pipeline/loaders/blaze/estimates.py b/zipline/pipeline/loaders/blaze/estimates.py index 58a2fba0..3cb1064a 100644 --- a/zipline/pipeline/loaders/blaze/estimates.py +++ b/zipline/pipeline/loaders/blaze/estimates.py @@ -105,20 +105,22 @@ class BlazeEstimatesLoader(PipelineLoader): self._checkpoints = checkpoints def load_adjusted_array(self, columns, dates, assets, mask): - column_names = [column.name for column in columns] + # Only load requested columns. + requested_column_names = [self._columns[column.name] + for column in columns] raw = load_raw_data( assets, dates, self._data_query_time, self._data_query_tz, - self._expr[sorted(metadata_columns.union(column_names))], + self._expr[sorted(metadata_columns.union(requested_column_names))], self._odo_kwargs, checkpoints=self._checkpoints, ) return self.loader( raw, - {k: self._columns[k] for k in column_names} + {column.name: self._columns[column.name] for column in columns} ).load_adjusted_array( columns, dates, diff --git a/zipline/pipeline/loaders/blaze/events.py b/zipline/pipeline/loaders/blaze/events.py index 0a6e56f3..cc778d5c 100644 --- a/zipline/pipeline/loaders/blaze/events.py +++ b/zipline/pipeline/loaders/blaze/events.py @@ -58,8 +58,8 @@ class BlazeEventsLoader(PipelineLoader): """ __doc__ = __doc__.format(SID_FIELD_NAME=SID_FIELD_NAME, - TS_FIELD_NAME=TS_FIELD_NAME, - EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME) + TS_FIELD_NAME=TS_FIELD_NAME, + EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME) @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, diff --git a/zipline/pipeline/loaders/earnings_estimates.py b/zipline/pipeline/loaders/earnings_estimates.py index c7a141a8..5a794683 100644 --- a/zipline/pipeline/loaders/earnings_estimates.py +++ b/zipline/pipeline/loaders/earnings_estimates.py @@ -37,7 +37,7 @@ NORMALIZED_QUARTERS = 'normalized_quarters' PREVIOUS_FISCAL_QUARTER = 'previous_fiscal_quarter' PREVIOUS_FISCAL_YEAR = 'previous_fiscal_year' SHIFTED_NORMALIZED_QTRS = 
'shifted_normalized_quarters' -SIMULTATION_DATES = 'dates' +SIMULATION_DATES = 'dates' def normalize_quarters(years, quarters): @@ -95,7 +95,7 @@ class EarningsEstimatesLoader(PipelineLoader): """ An abstract pipeline loader for estimates data that can load data a variable number of quarters forwards/backwards from calendar dates - depending on the `num_quarters` attribute of the columns' dataset. + depending on the `num_announcements` attribute of the columns' dataset. Parameters ---------- @@ -152,11 +152,11 @@ class EarningsEstimatesLoader(PipelineLoader): self.name_map = name_map @abstractmethod - def get_zeroth_quarter_idx(self, num_quarters, last, dates): + def get_zeroth_quarter_idx(self, num_announcements, last, dates): raise NotImplementedError('get_zeroth_quarter_idx') @abstractmethod - def get_shifted_qtrs(self, zero_qtrs, num_quarters): + def get_shifted_qtrs(self, zero_qtrs, num_announcements): raise NotImplementedError('get_shifted_qtrs') @abstractmethod @@ -178,7 +178,7 @@ class EarningsEstimatesLoader(PipelineLoader): zero_qtr_data, zeroth_quarter_idx, stacked_last_per_qtr, - num_quarters, + num_announcements, dates): """ Selects the requested data for each date. @@ -212,7 +212,7 @@ class EarningsEstimatesLoader(PipelineLoader): zeroth_quarter_idx.get_level_values( NORMALIZED_QUARTERS, ), - num_quarters, + num_announcements, ), ], names=[ @@ -397,18 +397,19 @@ class EarningsEstimatesLoader(PipelineLoader): def load_adjusted_array(self, columns, dates, assets, mask): # Separate out getting the columns' datasets and the datasets' - # num_quarters attributes to ensure that we're catching the right + # num_announcements attributes to ensure that we're catching the right # AttributeError. 
col_to_datasets = {col: col.dataset for col in columns} try: - groups = groupby(lambda col: col_to_datasets[col].num_quarters, + groups = groupby(lambda col: + col_to_datasets[col].num_announcements, col_to_datasets) except AttributeError: raise AttributeError("Datasets loaded via the " "EarningsEstimatesLoader must define a " - "`num_quarters` attribute that defines how " - "many quarters out the loader should load " - "the data relative to `dates`.") + "`num_announcements` attribute that defines " + "how many quarters out the loader should load" + " the data relative to `dates`.") if any(num_qtr < 0 for num_qtr in groups): raise ValueError( INVALID_NUM_QTRS_MESSAGE % ','.join( @@ -430,12 +431,12 @@ class EarningsEstimatesLoader(PipelineLoader): zeroth_quarter_idx = self.get_zeroth_quarter_idx(stacked_last_per_qtr) zero_qtr_data = stacked_last_per_qtr.loc[zeroth_quarter_idx] - for num_quarters, columns in groups.items(): + for num_announcements, columns in groups.items(): requested_qtr_data = self.get_requested_quarter_data( zero_qtr_data, zeroth_quarter_idx, stacked_last_per_qtr, - num_quarters, + num_announcements, dates, ) @@ -523,7 +524,7 @@ class EarningsEstimatesLoader(PipelineLoader): ) # Set date index name for ease of reference stacked_last_per_qtr.index.set_names( - SIMULTATION_DATES, + SIMULATION_DATES, level=0, inplace=True, ) @@ -560,8 +561,8 @@ class NextEarningsEstimatesLoader(EarningsEstimatesLoader): ].values[:next_qtr_start_idx], ) - def get_shifted_qtrs(self, zero_qtrs, num_quarters): - return zero_qtrs + (num_quarters - 1) + def get_shifted_qtrs(self, zero_qtrs, num_announcements): + return zero_qtrs + (num_announcements - 1) def get_zeroth_quarter_idx(self, stacked_last_per_qtr): """ @@ -584,9 +585,9 @@ class NextEarningsEstimatesLoader(EarningsEstimatesLoader): """ next_releases_per_date = stacked_last_per_qtr.loc[ stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >= - stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) + 
stacked_last_per_qtr.index.get_level_values(SIMULATION_DATES) ].groupby( - level=[SIMULTATION_DATES, SID_FIELD_NAME], + level=[SIMULATION_DATES, SID_FIELD_NAME], as_index=False, # Here we take advantage of the fact that `stacked_last_per_qtr` is # sorted by event date. @@ -612,8 +613,8 @@ class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): sid_idx, ) - def get_shifted_qtrs(self, zero_qtrs, num_quarters): - return zero_qtrs - (num_quarters - 1) + def get_shifted_qtrs(self, zero_qtrs, num_announcements): + return zero_qtrs - (num_announcements - 1) def get_zeroth_quarter_idx(self, stacked_last_per_qtr): """ @@ -636,9 +637,9 @@ class PreviousEarningsEstimatesLoader(EarningsEstimatesLoader): """ previous_releases_per_date = stacked_last_per_qtr.loc[ stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <= - stacked_last_per_qtr.index.get_level_values(SIMULTATION_DATES) + stacked_last_per_qtr.index.get_level_values(SIMULATION_DATES) ].groupby( - level=[SIMULTATION_DATES, SID_FIELD_NAME], + level=[SIMULATION_DATES, SID_FIELD_NAME], as_index=False, # Here we take advantage of the fact that `stacked_last_per_qtr` is # sorted by event date. diff --git a/zipline/testing/core.py b/zipline/testing/core.py index 19240a5b..f9832767 100644 --- a/zipline/testing/core.py +++ b/zipline/testing/core.py @@ -49,14 +49,8 @@ from zipline.pipeline.loaders.testing import make_seeded_random_loader from zipline.utils import security_list from zipline.utils.calendars import get_calendar from zipline.utils.input_validation import expect_dimensions -<<<<<<< HEAD from zipline.utils.numpy_utils import as_column, isnat from zipline.utils.pandas_utils import timedelta_to_integral_seconds -======= -from zipline.utils.numpy_utils import ( - as_column, -) ->>>>>>> WIP from zipline.utils.sentinel import sentinel import numpy as np