[Dataframes] Implemented .describe() (#1696)

* added describe methods * mean updates and added truediv func * updates * updated truediv test * porting stocks to ubuntu * hacky solution for describe, mean, median, quantile by transposing df * removed data file * removed faulty truediv implementation * flake8 and documentation updates * updated mean, median, var, std to handle mixed values * added describe methods * mean updates and added truediv func * updates * updated truediv test * porting stocks to ubuntu * hacky solution for describe, mean, median, quantile by transposing df * removed data file * removed faulty truediv implementation * flake8 and documentation updates * fixed quantile to drop object typed columns * syntax improvements * fixed flatten issue * fixing flatten issue * minor updates * added describe methods * mean updates and added truediv func * updates * updated truediv test * porting stocks to ubuntu * hacky solution for describe, mean, median, quantile by transposing df * removed data file * removed faulty truediv implementation * flake8 and documentation updates * updated mean, median, var, std to handle mixed values * added describe methods * mean updates and added truediv func * updates * updated truediv test * porting stocks to ubuntu * hacky solution for describe, mean, median, quantile by transposing df * removed data file * removed faulty truediv implementation * flake8 and documentation updates * fixed quantile to drop object typed columns * syntax improvements * fixed flatten issue * fixing flatten issue * improved describe syntax
2026-06-29 21:08:50 +08:00 · 2018-03-15 21:16:59 -07:00
parent 459fd5e152
commit 1f027344f1
2 changed files with 261 additions and 48 deletions
@@ -822,9 +822,54 @@ class DataFrame(object):
            return final_df

    def describe(self, percentiles=None, include=None, exclude=None):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """
+        Generates descriptive statistics that summarize the central tendency,
+        dispersion and shape of a dataset's distribution.
+
+        Columns whose dtype is ``object`` are dropped before any statistic
+        is computed.
+
+        Args:
+            percentiles (list-like of numbers, optional):
+                The percentiles to include in the output, each given as a
+                fraction (e.g. ``.25``). Defaults to ``[.25, .50, .75]``.
+            include: White-list of data types to include in results.
+                Accepted but not used by this implementation.
+            exclude: Black-list of data types to exclude in results.
+                Accepted but not used by this implementation.
+
+        Returns:
+            A pandas DataFrame with one row per statistic, in the order
+            count, mean, std, min, the requested percentiles, max.
+        """
+
+        obj_columns = [column
+                       for column, dtype in zip(self.columns, self.dtypes)
+                       if dtype == np.dtype('O')]
+
+        rdf = self.drop(columns=obj_columns)
+        transposed = rdf.T
+
+        if percentiles is None:
+            percentiles = [.25, .50, .75]
+
+        # Each statistic is stored as a column here and becomes a row after
+        # the final transpose, so assignment order is the output row order.
+        describe_df = pd.DataFrame()
+        describe_df['count'] = rdf.count()
+        describe_df['mean'] = transposed.mean(axis=1)
+        describe_df['std'] = transposed.std(axis=1)
+        describe_df['min'] = to_pandas(rdf.min())
+
+        for q in percentiles:
+            # "{:g}" keeps fractional percentiles distinct (.125 -> "12.5%").
+            # Rounding to a whole percent would give e.g. .001 and .002 the
+            # same "0%" label, and the later column would silently replace
+            # the earlier one.
+            percentile_str = "{0:g}%".format(q * 100)
+            describe_df[percentile_str] = transposed.quantile(q, axis=1)
+
+        describe_df['max'] = to_pandas(rdf.max())
+
+        return describe_df.T

    def diff(self, periods=1, axis=0):
        raise NotImplementedError(
@@ -1617,15 +1662,62 @@ class DataFrame(object):

    def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
             **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes mean across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the mean on. None is handled the
+                same way as 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Forwarded unchanged to the underlying mean call.
+            numeric_only: Forwarded unchanged to the underlying mean call.
+            **kwargs: Accepted but not used.
+
+        Returns:
+            The mean of the DataFrame. (Pandas series)
+        """
+
+        if axis == 0 or axis is None:
+            # Column-wise means are taken as row-wise means of the transpose.
+            return self.T.mean(
+                                axis=1, skipna=skipna,
+                                level=level, numeric_only=numeric_only
+                              )
+        else:
+            # Forward the caller's options to every partition. Hard-coding
+            # them to None here silently ignored skipna, level and
+            # numeric_only on this path.
+            func = (lambda df: df.T.mean(axis=0, skipna=skipna, level=level,
+                                         numeric_only=numeric_only))
+
+            computed_means = [
+                    _deploy_func.remote(func, part) for part in self._df]
+
+            items = ray.get(computed_means)
+
+            _mean = pd.concat(items)
+
+            return _mean

    def median(self, axis=None, skipna=None, level=None, numeric_only=None,
               **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes median across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the median on. None is handled the
+                same way as 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Forwarded unchanged to the underlying median call.
+            numeric_only: Forwarded unchanged to the underlying median call.
+            **kwargs: Accepted but not used.
+
+        Returns:
+            The median of the DataFrame. (Pandas series)
+        """
+        if axis == 0 or axis is None:
+            # Column-wise medians are taken as row-wise medians of the
+            # transpose. skipna is forwarded so it is honored on this path.
+            return self.T.median(
+                                axis=1, skipna=skipna, level=level,
+                                numeric_only=numeric_only
+                                )
+        else:
+
+            # skipna was previously dropped here, so the documented argument
+            # had no effect on the per-partition computation.
+            func = (lambda df: df.T.median(axis=0, skipna=skipna,
+                                           level=level,
+                                           numeric_only=numeric_only))
+
+            computed_medians = [
+                    _deploy_func.remote(func, part) for part in self._df]
+
+            items = ray.get(computed_medians)
+
+            _median = pd.concat(items)
+
+            return _median

    def melt(self, id_vars=None, value_vars=None, var_name=None,
             value_name='value', col_level=None):
@@ -1798,9 +1890,58 @@ class DataFrame(object):

    def quantile(self, q=0.5, axis=0, numeric_only=True,
                 interpolation='linear'):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Return values at the given quantile over requested axis,
+            a la numpy.percentile.
+
+        Columns whose dtype is ``object`` are dropped before computing.
+
+        Args:
+            q (float or list/tuple/numpy array of float): 0 <= q <= 1, the
+                quantile(s) to compute. A sequence is computed one entry at
+                a time.
+            axis (int): 0 (or None) computes one quantile per column; any
+                other value computes one quantile per row.
+            numeric_only: Forwarded unchanged to the underlying quantile
+                call.
+            interpolation: {'linear', 'lower', 'higher', 'midpoint',
+                'nearest'} Specifies which interpolation method to use
+
+        Returns:
+            quantiles : Series or DataFrame
+                    If q is a sequence, a DataFrame will be returned whose
+                    index is q and whose values are the quantiles.
+
+                    If q is a float, a Series will be returned whose
+                    values are the quantiles.
+        """
+
+        # isinstance also accepts tuples and numpy arrays (and list
+        # subclasses), which the exact-type check `type(q) is list` sent
+        # down the scalar path.
+        if isinstance(q, (list, tuple, np.ndarray)):
+            # NOTE(review): self.index is passed as the third constructor
+            # argument for both axes - confirm this is the intended labeling
+            # when axis is 0.
+            return DataFrame([self.quantile(q_i, axis=axis,
+                                            numeric_only=numeric_only,
+                                            interpolation=interpolation)
+                              for q_i in q], q, self.index)
+
+        # this section can be replaced with select_dtypes()
+
+        obj_columns = [self.columns[i]
+                       for i, t in enumerate(self.dtypes)
+                       if t == np.dtype('O')]
+
+        rdf = self.drop(columns=obj_columns)
+
+        if axis == 0 or axis is None:
+            return rdf.T.quantile(q, axis=1, numeric_only=numeric_only,
+                                  interpolation=interpolation)
+        else:
+            # Iterate the partitions of rdf, not self, so object columns are
+            # excluded on this path as well as on the axis=0 path.
+            computed_quantiles = [
+                _deploy_func.remote(
+                        lambda df: df.quantile(q, axis=1,
+                                               numeric_only=numeric_only,
+                                               interpolation=interpolation
+                                               ), part)
+                for part in rdf._df]
+
+            items = ray.get(computed_quantiles)
+
+            _quantile = pd.concat(items)
+
+            return _quantile

    def query(self, expr, inplace=False, **kwargs):
        """Queries the Dataframe with a boolean expression
@@ -2273,9 +2414,34 @@ class DataFrame(object):

    def std(self, axis=None, skipna=None, level=None, ddof=1,
            numeric_only=None, **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes standard deviation across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the std on; None behaves like 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Passed through to the underlying std call.
+            ddof (int): degrees of freedom
+            numeric_only: Passed through to the underlying std call.
+
+        Returns:
+            The std of the DataFrame (Pandas Series)
+        """
+        if axis is None or axis == 0:
+            # Column-wise std is taken as the row-wise std of the transpose.
+            flipped = self.T
+            return flipped.std(axis=1, skipna=skipna, level=level,
+                               ddof=ddof, numeric_only=numeric_only)
+
+        def partition_std(df):
+            return df.T.std(axis=0, skipna=skipna, level=level, ddof=ddof,
+                            numeric_only=numeric_only)
+
+        pending = [_deploy_func.remote(partition_std, part)
+                   for part in self._df]
+
+        return pd.concat(ray.get(pending))

    def sub(self, other, axis='columns', level=None, fill_value=None):
        raise NotImplementedError(
@@ -2529,9 +2695,32 @@ class DataFrame(object):

    def var(self, axis=None, skipna=None, level=None, ddof=1,
            numeric_only=None, **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes variance across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the variance on; None behaves
+                like 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Passed through to the underlying var call.
+            ddof (int): degrees of freedom
+            numeric_only: Passed through to the underlying var call.
+
+        Returns:
+            The variance of the DataFrame.
+        """
+        if axis is None or axis == 0:
+            # Column-wise variance is taken as the row-wise variance of the
+            # transpose.
+            flipped = self.T
+            return flipped.var(axis=1, skipna=skipna, level=level, ddof=ddof,
+                               numeric_only=numeric_only)
+
+        def partition_var(df):
+            return df.T.var(axis=0, skipna=skipna, level=level, ddof=ddof,
+                            numeric_only=numeric_only)
+
+        pending = [_deploy_func.remote(partition_var, part)
+                   for part in self._df]
+
+        return pd.concat(ray.get(pending))

    def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
              errors='raise', try_cast=False, raise_on_error=None):
@@ -202,6 +202,15 @@ def test_int_dataframe():
    test_round(ray_df, pandas_df)
    test_query(ray_df, pandas_df, query_funcs)

+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
    test___getitem__(ray_df, pandas_df)
@@ -316,6 +325,15 @@ def test_float_dataframe():
    test_round(ray_df, pandas_df)
    test_query(ray_df, pandas_df, query_funcs)

+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
    test___getitem__(ray_df, pandas_df)
@@ -432,6 +450,15 @@ def test_mixed_dtype_dataframe():
    test_round(ray_df, pandas_df)
    test_query(ray_df, pandas_df, query_funcs)

+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
    test___getitem__(ray_df, pandas_df)
@@ -549,6 +576,15 @@ def test_nan_dataframe():
    test_round(ray_df, pandas_df)
    test_query(ray_df, pandas_df, query_funcs)

+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
    test___getitem__(ray_df, pandas_df)
@@ -860,11 +896,9 @@ def test_cumsum(ray_df, pandas_df):
    assert(ray_df_equals_pandas(ray_df.cumsum(), pandas_df.cumsum()))


-def test_describe():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.describe()
+@pytest.fixture
+def test_describe(ray_df, pandas_df):
+    """Check that describe() on the Ray frame equals the pandas result."""
+    actual = ray_df.describe()
+    expected = pandas_df.describe()
+    assert actual.equals(expected)


 def test_diff():
@@ -1743,18 +1777,14 @@ def test_max(ray_df, pandas_df):
    assert(ray_df_equals_pandas(ray_df.max(), pandas_df.max()))


-def test_mean():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.mean()
+@pytest.fixture
+def test_mean(ray_df, pandas_df):
+    """Check that mean() on the Ray frame equals the pandas result."""
+    actual = ray_df.mean()
+    expected = pandas_df.mean()
+    assert actual.equals(expected)


-def test_median():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.median()
+@pytest.fixture
+def test_median(ray_df, pandas_df):
+    """Check that median() on the Ray frame equals the pandas result."""
+    actual = ray_df.median()
+    expected = pandas_df.median()
+    assert actual.equals(expected)


 def test_melt():
@@ -1915,11 +1945,9 @@ def test_product():
        ray_df.product()


-def test_quantile():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.quantile()
+@pytest.fixture
+def test_quantile(ray_df, pandas_df, q):
+    """Check that quantile(q) on the Ray frame equals the pandas result."""
+    actual = ray_df.quantile(q)
+    expected = pandas_df.quantile(q)
+    assert actual.equals(expected)


@pytest.fixture
@@ -2424,11 +2452,9 @@ def test_stack():
        ray_df.stack()


-def test_std():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.std()
+@pytest.fixture
+def test_std(ray_df, pandas_df):
+    """Check that std() on the Ray frame equals the pandas result."""
+    actual = ray_df.std()
+    expected = pandas_df.std()
+    assert actual.equals(expected)


 def test_sub():
@@ -2688,11 +2714,9 @@ def test_update():
        ray_df.update(None)


-def test_var():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.var()
+@pytest.fixture
+def test_var(ray_df, pandas_df):
+    """Check that var() on the Ray frame equals the pandas result."""
+    actual = ray_df.var()
+    expected = pandas_df.var()
+    assert actual.equals(expected)


 def test_where():