From bd0f1380816b8ed507d69d1cf34f8446edecc3e0 Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Thu, 5 May 2016 14:09:02 -0400 Subject: [PATCH] TEST/MAINT: Refactor unique axis verification. Break it into a standalone function that handles any pandas type. --- tests/test_panel_daily_bar_reader.py | 44 +++++++++++++++------------- zipline/data/us_equity_pricing.py | 20 +++++-------- zipline/utils/input_validation.py | 37 +++++++++++++++++++++++ 3 files changed, 68 insertions(+), 33 deletions(-) diff --git a/tests/test_panel_daily_bar_reader.py b/tests/test_panel_daily_bar_reader.py index be7b7c15..178a4553 100644 --- a/tests/test_panel_daily_bar_reader.py +++ b/tests/test_panel_daily_bar_reader.py @@ -13,34 +13,36 @@ # See the License for the specific language governing permissions and # limitations under the License. +from itertools import permutations + import pandas as pd from zipline.data.us_equity_pricing import PanelDailyBarReader -from zipline.testing.fixtures import WithTradingEnvironment, ZiplineTestCase +from zipline.testing import ExplodingObject +from zipline.testing.fixtures import ZiplineTestCase -class TestPanelDailyBarReader(WithTradingEnvironment, ZiplineTestCase): +class TestPanelDailyBarReader(ZiplineTestCase): def test_duplicate_values(self): - df = pd.DataFrame() - panel = pd.concat([pd.Panel({"X": df}), pd.Panel({"X": df})]) + UNIMPORTANT_VALUE = 57 - with self.assertRaises(ValueError) as e: - # panel's items has duplicates - PanelDailyBarReader(None, panel) + panel = pd.Panel( + UNIMPORTANT_VALUE, + items=['a', 'b', 'b', 'a'], + major_axis=['c'], + minor_axis=['d'], + ) + unused = ExplodingObject() - self.assertEqual("Duplicated items found: ['X']", - e.exception.message) + axis_names = ['items', 'major_axis', 'minor_axis'] - with self.assertRaises(ValueError) as e: - # panel's major axis has duplicates - PanelDailyBarReader(None, panel.swapaxes(0, 1)) + for axis_order in permutations((0, 1, 2)): + with self.assertRaises(ValueError) as e: + 
PanelDailyBarReader(unused, panel.transpose(*axis_order)) - self.assertEqual("Duplicated items found: ['X']", - e.exception.message) - - with self.assertRaises(ValueError) as e: - # panel's minor axis has duplicates - PanelDailyBarReader(None, panel.swapaxes(0, 2)) - - self.assertEqual("Duplicated items found: ['X']", - e.exception.message) + expected = ( + "Duplicate entries in Panel.{name}: ['a', 'b'].".format( + name=axis_names[axis_order.index(0)], + ) + ) + self.assertEqual(str(e.exception), expected) diff --git a/zipline/data/us_equity_pricing.py b/zipline/data/us_equity_pricing.py index 7637c001..197b74ca 100644 --- a/zipline/data/us_equity_pricing.py +++ b/zipline/data/us_equity_pricing.py @@ -54,10 +54,12 @@ from six import ( ) from zipline.utils.functional import apply +from zipline.utils.preprocess import call from zipline.utils.input_validation import ( coerce_string, preprocess, expect_element, + verify_indices_all_unique, ) from zipline.utils.sqlite_utils import group_into_chunks from zipline.utils.memoize import lazyval @@ -696,9 +698,12 @@ class PanelDailyBarReader(DailyBarReader): DataPanel Structure ------- - items : Int64Index, asset identifiers - major_axis : DatetimeIndex, days provided by the Panel. + items : Int64Index + Asset identifiers. Must be unique. + major_axis : DatetimeIndex + Dates for data provided by the Panel. Must be unique. minor_axis : ['open', 'high', 'low', 'close', 'volume'] + Price attributes. Must be unique. Attributes ---------- @@ -710,17 +715,8 @@ class PanelDailyBarReader(DailyBarReader): first_trading_day : pd.Timestamp The first trading day in the dataset. 
""" + @preprocess(panel=call(verify_indices_all_unique)) def __init__(self, calendar, panel): - # check duplicates on all indices of panel - - for attr_name in ["items", "major_axis", "minor_axis"]: - index = getattr(panel, attr_name) - duplicates = index.duplicated() - - if duplicates.any(): - raise ValueError("Duplicated items found: {0}".format( - index[duplicates].values - )) panel = panel.copy() if 'volume' not in panel.items: diff --git a/zipline/utils/input_validation.py b/zipline/utils/input_validation.py index 66e8b9fe..5b2c84c0 100644 --- a/zipline/utils/input_validation.py +++ b/zipline/utils/input_validation.py @@ -25,6 +25,43 @@ import toolz.curried.operator as op from zipline.utils.preprocess import preprocess +def verify_indices_all_unique(obj): + """ + Check that all axes of a pandas object are unique. + + Parameters + ---------- + obj : pd.Series / pd.DataFrame / pd.Panel + The object to validate. + + Returns + ------- + None + + Raises + ------ + ValueError + If any axis has duplicate entries. + """ + axis_names = [ + ('index',), # Series + ('index', 'columns'), # DataFrame + ('items', 'major_axis', 'minor_axis') # Panel + ][obj.ndim - 1] # ndim = 1 should go to entry 0, + + for axis_name, index in zip(axis_names, obj.axes): + if index.is_unique: + continue + + raise ValueError( + "Duplicate entries in {type}.{axis}: {dupes}.".format( + type=type(obj).__name__, + axis=axis_name, + dupes=sorted(index[index.duplicated()]), + ) + ) + + def optionally(preprocessor): """Modify a preprocessor to explicitly allow `None`.