From 9f28529e2cb00b3b73d1fe4085b756f630e91d63 Mon Sep 17 00:00:00 2001 From: Rohan Singh <11rohans@gmail.com> Date: Sun, 6 May 2018 09:32:33 -0700 Subject: [PATCH] [DataFrame] Implement rank (#1991) * rank method completed * added sanity checks * flake8 * updated sanity checks * flake8 * updated sanity checks and style * updated dtype logic * Fixing test --- python/ray/dataframe/dataframe.py | 49 +++++++++++++++++++-- python/ray/dataframe/test/test_dataframe.py | 15 ++++--- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index b886691d5..bd6558fd3 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -3166,9 +3166,52 @@ class DataFrame(object): def rank(self, axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + + """ + Compute numerical data ranks (1 through n) along axis. + Equal values are assigned a rank that is the [method] of + the ranks of those values. + + Args: + axis (int): 0 or 'index' for row-wise, + 1 or 'columns' for column-wise + method: {'average', 'min', 'max', 'first', 'dense'} + Specifies which method to use for equal values + numeric_only (boolean): + Include only float, int, boolean data. 
+ na_option: {'keep', 'top', 'bottom'} + Specifies how to handle NA values + ascending (boolean): + Decides ranking order + pct (boolean): + Computes percentage ranking of data + Returns: + A new DataFrame + """ + + def rank_helper(df): + return df.rank(axis=axis, method=method, + numeric_only=numeric_only, + na_option=na_option, + ascending=ascending, pct=pct) + + axis = pd.DataFrame()._get_axis_number(axis) + + if (axis == 1): + new_cols = self.dtypes[self.dtypes.apply( + lambda x: is_numeric_dtype(x))].index + result = _map_partitions(rank_helper, + self._row_partitions) + return DataFrame(row_partitions=result, + columns=new_cols, + index=self.index) + + if (axis == 0): + result = _map_partitions(rank_helper, + self._col_partitions) + return DataFrame(col_partitions=result, + columns=self.columns, + index=self.index) def rdiv(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 43d11b6b4..fa3fb5667 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -229,6 +229,7 @@ def test_int_dataframe(): test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) test_diff(ray_df, pandas_df) + test_rank(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -396,6 +397,7 @@ def test_float_dataframe(): test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) test_diff(ray_df, pandas_df) + test_rank(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -564,6 +566,9 @@ def test_mixed_dtype_dataframe(): test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + # TODO Resolve once Pandas-20962 is resolved. 
+ # test_rank(ray_df, pandas_df) + test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) test___getitem__(ray_df, pandas_df) @@ -722,6 +727,7 @@ def test_nan_dataframe(): test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) test_diff(ray_df, pandas_df) + test_rank(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -2377,11 +2383,10 @@ def test_radd(): test_inter_df_math_right_ops("radd") -def test_rank(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.rank() +@pytest.fixture +def test_rank(ray_df, pandas_df): + assert(ray_df_equals_pandas(ray_df.rank(), pandas_df.rank())) + assert(ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1))) def test_rdiv():