diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py
index bd9f34e43..1e89056be 100644
--- a/python/ray/dataframe/dataframe.py
+++ b/python/ray/dataframe/dataframe.py
@@ -822,9 +822,53 @@ class DataFrame(object):
         return final_df
 
     def describe(self, percentiles=None, include=None, exclude=None):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """
+        Generates descriptive statistics that summarize the central tendency,
+        dispersion and shape of a dataset's distribution, excluding NaN values.
+
+        Columns of object dtype are dropped before any statistic is computed.
+
+        Args:
+            percentiles (list-like of numbers, optional):
+                The percentiles to include in the output, as fractions
+                (.25 is the 25th percentile). Defaults to [.25, .5, .75].
+            include: White-list of data types to include in results.
+                Currently ignored.
+            exclude: Black-list of data types to exclude in results.
+                Currently ignored.
+
+        Returns: pandas DataFrame of summary statistics, one row per
+            statistic (count, mean, std, min, the percentiles, max)
+        """
+
+        obj_columns = [self.columns[i]
+                       for i, t in enumerate(self.dtypes)
+                       if t == np.dtype('O')]
+
+        rdf = self.drop(columns=obj_columns)
+
+        transposed = rdf.T
+
+        if percentiles is None:
+            percentiles = [.25, .50, .75]
+
+        # Each statistic is stored as a column, in display order; the frame
+        # is transposed on return so that the statistics become rows.
+        describe_df = pd.DataFrame()
+        describe_df['count'] = rdf.count()
+        describe_df['mean'] = transposed.mean(axis=1)
+        describe_df['std'] = transposed.std(axis=1)
+        describe_df['min'] = to_pandas(rdf.min())
+
+        for q in percentiles:
+            # '{:g}' keeps fractional percentiles apart (.125 -> '12.5%',
+            # not '12%') and still renders .25 as '25%'.
+            percentile_str = "{0:g}%".format(q * 100)
+            describe_df[percentile_str] = transposed.quantile(q, axis=1)
+
+        describe_df['max'] = to_pandas(rdf.max())
+
+        return describe_df.T
 
     def diff(self, periods=1, axis=0):
         raise NotImplementedError(
@@ -1617,15 +1661,70 @@ class DataFrame(object):
 
     def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
              **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes mean across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the mean on. None is treated as 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Passed through to the underlying mean call.
+            numeric_only: Passed through to the underlying mean call.
+
+        Returns:
+            The mean of the DataFrame. (Pandas series)
+        """
+
+        if axis == 0 or axis is None:
+            return self.T.mean(
+                axis=1, skipna=skipna,
+                level=level, numeric_only=numeric_only
+            )
+        else:
+            # Honor the caller's arguments; they were previously replaced
+            # by hard-coded None values here.
+            func = (lambda df: df.T.mean(axis=0, skipna=skipna, level=level,
+                                         numeric_only=numeric_only))
+
+            computed_means = [
+                _deploy_func.remote(func, part) for part in self._df]
+
+            items = ray.get(computed_means)
+
+            _mean = pd.concat(items)
+
+            return _mean
 
     def median(self, axis=None, skipna=None, level=None, numeric_only=None,
                **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes median across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the median on. None is treated as 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Passed through to the underlying median call.
+            numeric_only: Passed through to the underlying median call.
+
+        Returns:
+            The median of the DataFrame. (Pandas series)
+        """
+        if axis == 0 or axis is None:
+            return self.T.median(
+                axis=1, skipna=skipna,
+                level=level, numeric_only=numeric_only
+            )
+        else:
+            # skipna is forwarded as well; it used to be dropped silently.
+            func = (lambda df: df.T.median(axis=0, skipna=skipna,
+                                           level=level,
+                                           numeric_only=numeric_only))
+
+            computed_medians = [
+                _deploy_func.remote(func, part) for part in self._df]
+
+            items = ray.get(computed_medians)
+
+            _median = pd.concat(items)
+
+            return _median
 
     def melt(self, id_vars=None, value_vars=None, var_name=None,
              value_name='value', col_level=None):
@@ -1798,9 +1897,65 @@ class DataFrame(object):
 
     def quantile(self, q=0.5, axis=0, numeric_only=True,
                  interpolation='linear'):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Return values at the given quantile over requested axis,
+        a la numpy.percentile.
+
+        Object-dtype columns are dropped before the quantile is taken.
+
+        Args:
+            q (float or list): 0 <= q <= 1, the quantile(s) to compute
+            axis (int): 0 or 'index' for row-wise,
+                1 or 'columns' for column-wise
+            numeric_only: Passed through to the underlying quantile call.
+            interpolation: {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+                Specifies which interpolation method to use
+
+        Returns:
+            quantiles : Series or DataFrame
+                If q is an array, a DataFrame will be returned where the
+                index is q, the columns are the columns of self, and the
+                values are the quantiles.
+
+                If q is a float, a Series will be returned where the
+                index is the columns of self and the values
+                are the quantiles.
+        """
+
+        if isinstance(q, list):
+            # NOTE(review): q and self.index are handed to DataFrame
+            # positionally, while the docstring promises self's columns;
+            # confirm against the DataFrame constructor.
+            return DataFrame([self.quantile(q_i, axis=axis,
+                                            numeric_only=numeric_only,
+                                            interpolation=interpolation)
+                              for q_i in q], q, self.index)
+
+        # this section can be replaced with select_dtypes()
+
+        obj_columns = [self.columns[i]
+                       for i, t in enumerate(self.dtypes)
+                       if t == np.dtype('O')]
+
+        rdf = self.drop(columns=obj_columns)
+
+        if axis in (0, 'index', None):
+            return rdf.T.quantile(q, axis=1, numeric_only=numeric_only,
+                                  interpolation=interpolation)
+        else:
+            func = (lambda df: df.quantile(q, axis=1,
+                                           numeric_only=numeric_only,
+                                           interpolation=interpolation))
+
+            # Map over rdf's partitions, not self's, so the object-dtype
+            # columns dropped above stay dropped on this axis too.
+            computed_quantiles = [
+                _deploy_func.remote(func, part) for part in rdf._df]
+
+            items = ray.get(computed_quantiles)
+
+            _quantile = pd.concat(items)
+
+            return _quantile
 
     def query(self, expr, inplace=False, **kwargs):
         """Queries the Dataframe with a boolean expression
@@ -2273,9 +2428,35 @@ class DataFrame(object):
 
     def std(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None,
             **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes standard deviation across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the std on. None is treated as 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Passed through to the underlying std call.
+            ddof (int): degrees of freedom
+            numeric_only: Passed through to the underlying std call.
+
+        Returns:
+            The std of the DataFrame (Pandas Series)
+        """
+        if axis == 0 or axis is None:
+            return self.T.std(
+                axis=1, skipna=skipna, level=level,
+                ddof=ddof, numeric_only=numeric_only)
+        else:
+            func = (lambda df: df.T.std(axis=0, skipna=skipna, level=level,
+                                        ddof=ddof,
+                                        numeric_only=numeric_only))
+
+            computed_stds = [
+                _deploy_func.remote(func, part) for part in self._df]
+
+            items = ray.get(computed_stds)
+
+            _stds = pd.concat(items)
+
+            return _stds
 
     def sub(self, other, axis='columns', level=None, fill_value=None):
         raise NotImplementedError(
@@ -2529,9 +2710,34 @@ class DataFrame(object):
 
     def var(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None,
             **kwargs):
-        raise NotImplementedError(
-            "To contribute to Pandas on Ray, please visit "
-            "github.com/ray-project/ray.")
+        """Computes variance across the DataFrame.
+
+        Args:
+            axis (int): The axis to take the variance on. None is treated as 0.
+            skipna (bool): True to skip NA values, false otherwise.
+            level: Passed through to the underlying var call.
+            ddof (int): degrees of freedom
+            numeric_only: Passed through to the underlying var call.
+
+        Returns:
+            The variance of the DataFrame.
+        """
+        if axis == 0 or axis is None:
+            return self.T.var(axis=1, skipna=skipna, level=level, ddof=ddof,
+                              numeric_only=numeric_only)
+        else:
+            func = (lambda df: df.T.var(axis=0, skipna=skipna, level=level,
+                                        ddof=ddof,
+                                        numeric_only=numeric_only))
+
+            computed_vars = [
+                _deploy_func.remote(func, part) for part in self._df]
+
+            items = ray.get(computed_vars)
+
+            _var = pd.concat(items)
+
+            return _var
 
     def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
               errors='raise', try_cast=False, raise_on_error=None):
diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py
index a9bad313e..3813d0baf 100644
--- a/python/ray/dataframe/test/test_dataframe.py
+++ b/python/ray/dataframe/test/test_dataframe.py
@@ -202,6 +202,15 @@ def test_int_dataframe():
     test_round(ray_df, pandas_df)
     test_query(ray_df, pandas_df, query_funcs)
 
+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
     test_all(ray_df, pandas_df)
     test_any(ray_df, pandas_df)
     test___getitem__(ray_df, pandas_df)
@@ -316,6 +325,15 @@ def test_float_dataframe():
     test_round(ray_df, pandas_df)
     test_query(ray_df, pandas_df, query_funcs)
 
+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
     test_all(ray_df, pandas_df)
     test_any(ray_df, pandas_df)
     test___getitem__(ray_df, pandas_df)
@@ -432,6 +450,15 @@ def test_mixed_dtype_dataframe():
     test_round(ray_df, pandas_df)
     test_query(ray_df, pandas_df, query_funcs)
 
+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
     test_all(ray_df, pandas_df)
     test_any(ray_df, pandas_df)
     test___getitem__(ray_df, pandas_df)
@@ -549,6 +576,15 @@ def test_nan_dataframe():
     test_round(ray_df, pandas_df)
     test_query(ray_df, pandas_df, query_funcs)
 
+    test_mean(ray_df, pandas_df)
+    test_var(ray_df, pandas_df)
+    test_std(ray_df, pandas_df)
+    test_median(ray_df, pandas_df)
+    test_quantile(ray_df, pandas_df, .25)
+    test_quantile(ray_df, pandas_df, .5)
+    test_quantile(ray_df, pandas_df, .75)
+    test_describe(ray_df, pandas_df)
+
     test_all(ray_df, pandas_df)
     test_any(ray_df, pandas_df)
     test___getitem__(ray_df, pandas_df)
@@ -860,11 +896,9 @@ def test_cumsum(ray_df, pandas_df):
     assert(ray_df_equals_pandas(ray_df.cumsum(), pandas_df.cumsum()))
 
 
-def test_describe():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.describe()
+@pytest.fixture
+def test_describe(ray_df, pandas_df):
+    assert(ray_df.describe().equals(pandas_df.describe()))
 
 
 def test_diff():
@@ -1743,18 +1777,14 @@ def test_max(ray_df, pandas_df):
     assert(ray_df_equals_pandas(ray_df.max(), pandas_df.max()))
 
 
-def test_mean():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.mean()
+@pytest.fixture
+def test_mean(ray_df, pandas_df):
+    assert(ray_df.mean().equals(pandas_df.mean()))
 
 
-def test_median():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.median()
+@pytest.fixture
+def test_median(ray_df, pandas_df):
+    assert(ray_df.median().equals(pandas_df.median()))
 
 
 def test_melt():
@@ -1915,11 +1945,9 @@ def test_product():
         ray_df.product()
 
 
-def test_quantile():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.quantile()
+@pytest.fixture
+def test_quantile(ray_df, pandas_df, q):
+    assert(ray_df.quantile(q).equals(pandas_df.quantile(q)))
 
 
 @pytest.fixture
@@ -2424,11 +2452,9 @@ def test_stack():
         ray_df.stack()
 
 
-def test_std():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.std()
+@pytest.fixture
+def test_std(ray_df, pandas_df):
+    assert(ray_df.std().equals(pandas_df.std()))
 
 
 def test_sub():
@@ -2688,11 +2714,9 @@ def test_update():
         ray_df.update(None)
 
 
-def test_var():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.var()
+@pytest.fixture
+def test_var(ray_df, pandas_df):
+    assert(ray_df.var().equals(pandas_df.var()))
 
 
 def test_where():