[DataFrame] Implement rank (#1991)

* rank method completed * added sanity checks * flake8 * updated sanity checks * flake8 * updated sanity checks and style * updated dtype logic * Fixing test
2026-06-30 22:20:31 +08:00 · 2018-05-06 09:32:33 -07:00
parent 857458c37c
commit 9f28529e2c
2 changed files with 56 additions and 8 deletions
@@ -3166,9 +3166,52 @@ class DataFrame(object):

    def rank(self, axis=0, method='average', numeric_only=None,
             na_option='keep', ascending=True, pct=False):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+
+        """
+        Compute numerical data ranks (1 through n) along axis.
+        Equal values are assigned a rank that is the [method] of
+        the ranks of those values.
+
+        Args:
+            axis (int): 0 or 'index' for row-wise,
+                        1 or 'columns' for column-wise
+            method: {'average', 'min', 'max', 'first', 'dense'}
+                Specifies which method to use for equal values
+            numeric_only (boolean):
+                Include only float, int, boolean data.
+            na_option: {'keep', 'top', 'bottom'}
+                Specifies how to handle NA values
+            ascending (boolean):
+                Decides ranking order
+            pct (boolean):
+                Computes percentage ranking of data
+        Returns:
+            A new DataFrame
+        """
+
+        def rank_helper(df):
+            return df.rank(axis=axis, method=method,
+                           numeric_only=numeric_only,
+                           na_option=na_option,
+                           ascending=ascending, pct=pct)
+
+        axis = pd.DataFrame()._get_axis_number(axis)
+
+        if (axis == 1):
+            new_cols = self.dtypes[self.dtypes.apply(
+                                   lambda x: is_numeric_dtype(x))].index
+            result = _map_partitions(rank_helper,
+                                     self._row_partitions)
+            return DataFrame(row_partitions=result,
+                             columns=new_cols,
+                             index=self.index)
+
+        if (axis == 0):
+            result = _map_partitions(rank_helper,
+                                     self._col_partitions)
+            return DataFrame(col_partitions=result,
+                             columns=self.columns,
+                             index=self.index)

    def rdiv(self, other, axis='columns', level=None, fill_value=None):
        return self._single_df_op_helper(
@@ -229,6 +229,7 @@ def test_int_dataframe():
    test_quantile(ray_df, pandas_df, .75)
    test_describe(ray_df, pandas_df)
    test_diff(ray_df, pandas_df)
+    test_rank(ray_df, pandas_df)

    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
@@ -396,6 +397,7 @@ def test_float_dataframe():
    test_quantile(ray_df, pandas_df, .75)
    test_describe(ray_df, pandas_df)
    test_diff(ray_df, pandas_df)
+    test_rank(ray_df, pandas_df)

    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
@@ -564,6 +566,9 @@ def test_mixed_dtype_dataframe():
    test_quantile(ray_df, pandas_df, .75)
    test_describe(ray_df, pandas_df)

+    # TODO Re-enable once Pandas-20962 is resolved.
+    # test_rank(ray_df, pandas_df)
+
    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
    test___getitem__(ray_df, pandas_df)
@@ -722,6 +727,7 @@ def test_nan_dataframe():
    test_quantile(ray_df, pandas_df, .75)
    test_describe(ray_df, pandas_df)
    test_diff(ray_df, pandas_df)
+    test_rank(ray_df, pandas_df)

    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
@@ -2377,11 +2383,10 @@ def test_radd():
    test_inter_df_math_right_ops("radd")


-def test_rank():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.rank()
+@pytest.fixture
+def test_rank(ray_df, pandas_df):
+    assert(ray_df_equals_pandas(ray_df.rank(), pandas_df.rank()))
+    assert(ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1)))


 def test_rdiv():