From b79912ec74aeb0e43e289425eff1014b1ca82779 Mon Sep 17 00:00:00 2001 From: Kunal Gosar Date: Thu, 10 May 2018 08:25:24 -0700 Subject: [PATCH] [DataFrame] Fixes dropna subset bug (#2018) * fix dropna * resolve comment --- python/ray/dataframe/dataframe.py | 15 +++++--- python/ray/dataframe/test/test_dataframe.py | 39 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index b3223682a..d250a13d5 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -806,17 +806,22 @@ class DataFrame(object): if how is None and thresh is None: raise TypeError('must specify how or thresh') + indices = None if subset is not None: - subset = set(subset) - if axis == 1: - subset = [item for item in self.index if item in subset] + indices = self.index.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) else: - subset = [item for item in self.columns if item in subset] + indices = self.columns.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) def dropna_helper(df): new_df = df.dropna(axis=axis, how=how, thresh=thresh, - subset=subset, inplace=False) + subset=indices, inplace=False) if axis == 1: new_index = new_df.columns diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index d944c5bb5..6cd72ec4b 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -842,10 +842,15 @@ def test_dense_nan_df(): [np.nan, np.nan, np.nan, 5]], columns=list('ABCD')) + column_subsets = [list('AD'), list('BC'), list('CD')] + row_subsets = [[0, 1], [0, 1, 2], [2, 0]] + test_dropna(ray_df, pd_df) test_dropna_inplace(ray_df, pd_df) test_dropna_multiple_axes(ray_df, pd_df) test_dropna_multiple_axes_inplace(ray_df, pd_df) + test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets) + test_dropna_subset_error(ray_df) @pytest.fixture @@ -1402,6 +1407,40 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df): assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) +@pytest.fixture +def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets): + for subset in column_subsets: + assert ray_df_equals_pandas( + ray_df.dropna(how='all', subset=subset), + pd_df.dropna(how='all', subset=subset) + ) + + assert ray_df_equals_pandas( + ray_df.dropna(how='any', subset=subset), + pd_df.dropna(how='any', subset=subset) + ) + + for subset in row_subsets: + assert ray_df_equals_pandas( + ray_df.dropna(how='all', axis=1, subset=subset), + pd_df.dropna(how='all', axis=1, subset=subset) + ) + + assert ray_df_equals_pandas( + ray_df.dropna(how='any', axis=1, subset=subset), + pd_df.dropna(how='any', axis=1, subset=subset) + ) + + +@pytest.fixture +def test_dropna_subset_error(ray_df): + with pytest.raises(KeyError): + ray_df.dropna(subset=list('EF')) + + with pytest.raises(KeyError): + ray_df.dropna(axis=1, subset=[4, 5]) + + def test_duplicated(): ray_df = create_test_dataframe()