[DataFrame] Implement rank (#1991)

* rank method completed

* added sanity checks

* flake8

* updated sanity checks

* flake8

* updated sanity checks and style

* updated dtype logic

* Fixing test
This commit is contained in:
Rohan Singh
2018-05-06 09:32:33 -07:00
committed by Devin Petersohn
parent 857458c37c
commit 9f28529e2c
2 changed files with 56 additions and 8 deletions
+46 -3
View File
@@ -3166,9 +3166,52 @@ class DataFrame(object):
def rank(self, axis=0, method='average', numeric_only=None,
         na_option='keep', ascending=True, pct=False):
    """Compute numerical data ranks (1 through n) along axis.

    Equal values are assigned a rank that is the [method] of the ranks
    of those values. The work is delegated to ``pandas.DataFrame.rank``
    on each partition.

    Args:
        axis (int or str): 0 or 'index' to rank down each column,
            1 or 'columns' to rank across each row.
        method: {'average', 'min', 'max', 'first', 'dense'}
            Specifies which method to use for equal values.
        numeric_only (boolean):
            Include only float, int, boolean data.
        na_option: {'keep', 'top', 'bottom'}
            Specifies how to handle NA values.
        ascending (boolean):
            Decides ranking order.
        pct (boolean):
            Computes percentage ranking of data.

    Returns:
        A new DataFrame containing the ranks.
    """
    # Normalise 'index'/'columns' aliases to 0/1. This happens before any
    # partition is mapped, so the helper below sees the integer form.
    axis = pd.DataFrame()._get_axis_number(axis)

    def rank_partition(df):
        # Forward every ranking option unchanged to pandas.
        return df.rank(axis=axis, method=method,
                       numeric_only=numeric_only,
                       na_option=na_option,
                       ascending=ascending, pct=pct)

    if axis == 1:
        # Row-wise ranking: the result is labelled with the numeric-dtype
        # columns only.
        # NOTE(review): presumably pandas drops non-numeric columns when
        # ranking across rows -- confirm against the pandas behaviour.
        numeric_cols = self.dtypes[
            self.dtypes.apply(is_numeric_dtype)].index
        ranked = _map_partitions(rank_partition, self._row_partitions)
        return DataFrame(row_partitions=ranked,
                         columns=numeric_cols,
                         index=self.index)

    # axis == 0: each column is ranked independently, so the column
    # partitions can be processed as-is and the labels are unchanged.
    ranked = _map_partitions(rank_partition, self._col_partitions)
    return DataFrame(col_partitions=ranked,
                     columns=self.columns,
                     index=self.index)
def rdiv(self, other, axis='columns', level=None, fill_value=None):
return self._single_df_op_helper(
+10 -5
View File
@@ -229,6 +229,7 @@ def test_int_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_diff(ray_df, pandas_df)
test_rank(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
@@ -396,6 +397,7 @@ def test_float_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_diff(ray_df, pandas_df)
test_rank(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
@@ -564,6 +566,9 @@ def test_mixed_dtype_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
# TODO Resolve once Pandas-20962 is resolved.
# test_rank(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
@@ -722,6 +727,7 @@ def test_nan_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_diff(ray_df, pandas_df)
test_rank(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
@@ -2377,11 +2383,10 @@ def test_radd():
test_inter_df_math_right_ops("radd")
@pytest.fixture
def test_rank(ray_df, pandas_df):
    """Check that ``rank`` on the Ray DataFrame matches pandas.

    Called directly by the per-dtype drivers (e.g. ``test_int_dataframe``)
    with a Ray DataFrame and the pandas DataFrame it should agree with.
    Both the default axis and ``axis=1`` are compared.

    NOTE(review): presumably the ``pytest.fixture`` decorator is here to
    keep pytest from collecting this helper as a standalone test --
    confirm.
    """
    assert ray_df_equals_pandas(ray_df.rank(), pandas_df.rank())
    assert ray_df_equals_pandas(ray_df.rank(axis=1),
                                pandas_df.rank(axis=1))
def test_rdiv():