[Dataframes] Implemented .describe() (#1696)

* added describe methods

* mean updates and added truediv func

* updates

* updated truediv test

* porting stocks to ubuntu

* hacky solution for describe, mean, median, quantile by transposing df

* removed data file

* removed faulty truediv implementation

* flake8 and documentation updates

* updated mean, median, var, std to handle mixed values

* added describe methods

* mean updates and added truediv func

* updates

* updated truediv test

* porting stocks to ubuntu

* hacky solution for describe, mean, median, quantile by transposing df

* removed data file

* removed faulty truediv implementation

* flake8 and documentation updates

* fixed quantile to drop object typed columns

* syntax improvements"

* fixed flatten issue

* fixing flatten issue

* minor updates

* added describe methods

* mean updates and added truediv func

* updates

* updated truediv test

* porting stocks to ubuntu

* hacky solution for describe, mean, median, quantile by transposing df

* removed data file

* removed faulty truediv implementation

* flake8 and documentation updates

* updated mean, median, var, std to handle mixed values

* added describe methods

* mean updates and added truediv func

* updates

* updated truediv test

* porting stocks to ubuntu

* hacky solution for describe, mean, median, quantile by transposing df

* removed data file

* removed faulty truediv implementation

* flake8 and documentation updates

* fixed quantile to drop object typed columns

* syntax improvements"

* fixed flatten issue

* fixing flatten issue

* improved describe syntax
This commit is contained in:
Rohan Singh
2018-03-15 21:16:59 -07:00
committed by Devin Petersohn
parent 459fd5e152
commit 1f027344f1
2 changed files with 261 additions and 48 deletions
+207 -18
View File
@@ -822,9 +822,54 @@ class DataFrame(object):
return final_df
def describe(self, percentiles=None, include=None, exclude=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
"""
Generates descriptive statistics that summarize the central tendency,
dispersion and shape of a datasets distribution, excluding NaN values.
Args:
percentiles (list-like of numbers, optional):
The percentiles to include in the output.
include: White-list of data types to include in results
exclude: Black-list of data types to exclude in results
Returns: Series/DataFrame of summary statistics
"""
obj_columns = [self.columns[i]
for i, t in enumerate(self.dtypes)
if t == np.dtype('O')]
rdf = self.drop(columns=obj_columns)
transposed = rdf.T
count_df = rdf.count()
mean_df = transposed.mean(axis=1)
std_df = transposed.std(axis=1)
min_df = to_pandas(rdf.min())
if percentiles is None:
percentiles = [.25, .50, .75]
percentiles_dfs = [transposed.quantile(q, axis=1)
for q in percentiles]
max_df = to_pandas(rdf.max())
describe_df = pd.DataFrame()
describe_df['count'] = count_df
describe_df['mean'] = mean_df
describe_df['std'] = std_df
describe_df['min'] = min_df
for i in range(len(percentiles)):
percentile_str = "{0:.0f}%".format(percentiles[i]*100)
describe_df[percentile_str] = percentiles_dfs[i]
describe_df['max'] = max_df
return describe_df.T
def diff(self, periods=1, axis=0):
raise NotImplementedError(
@@ -1617,15 +1662,62 @@ class DataFrame(object):
def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
"""Computes mean across the DataFrame.
Args:
axis (int): The axis to take the mean on.
skipna (bool): True to skip NA values, false otherwise.
Returns:
The mean of the DataFrame. (Pandas series)
"""
if axis == 0 or axis is None:
return self.T.mean(
axis=1, skipna=skipna,
level=level, numeric_only=numeric_only
)
else:
func = (lambda df: df.T.mean(axis=0,
skipna=None, level=None, numeric_only=None))
computed_means = [
_deploy_func.remote(func, part) for part in self._df]
items = ray.get(computed_means)
_mean = pd.concat(items)
return _mean
def median(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
"""Computes median across the DataFrame.
Args:
axis (int): The axis to take the median on.
skipna (bool): True to skip NA values, false otherwise.
Returns:
The median of the DataFrame. (Pandas series)
"""
if axis == 0 or axis is None:
return self.T.median(
axis=1, level=level, numeric_only=numeric_only
)
else:
func = (lambda df: df.T.median(axis=0, level=level,
numeric_only=numeric_only))
computed_medians = [
_deploy_func.remote(func, part) for part in self._df]
items = ray.get(computed_medians)
_median = pd.concat(items)
return _median
def melt(self, id_vars=None, value_vars=None, var_name=None,
value_name='value', col_level=None):
@@ -1798,9 +1890,58 @@ class DataFrame(object):
def quantile(self, q=0.5, axis=0, numeric_only=True,
interpolation='linear'):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
"""Return values at the given quantile over requested axis,
a la numpy.percentile.
Args:
q (float): 0 <= q <= 1, the quantile(s) to compute
axis (int): 0 or index for row-wise,
1 or columns for column-wise
interpolation: {'linear, lower, higher, midpoint, nearest}
Specifies which interpolation method to use
Returns:
quantiles : Series or DataFrame
If q is an array, a DataFrame will be returned where the
index is q, the columns are the columns of self, and the
values are the quantiles.
If q is a float, a Series will be returned where the
index is the columns of self and the values
are the quantiles.
"""
if (type(q) is list):
return DataFrame([self.quantile(q_i, axis=axis,
numeric_only=numeric_only,
interpolation=interpolation)
for q_i in q], q, self.index)
# this section can be replaced with select_dtypes()
obj_columns = [self.columns[i]
for i, t in enumerate(self.dtypes)
if t == np.dtype('O')]
rdf = self.drop(columns=obj_columns)
if axis == 0 or axis is None:
return rdf.T.quantile(q, axis=1, numeric_only=numeric_only,
interpolation=interpolation)
else:
computed_quantiles = [
_deploy_func.remote(
lambda df: df.quantile(q, axis=1,
numeric_only=numeric_only,
interpolation=interpolation
), part)
for part in self._df]
items = ray.get(computed_quantiles)
_quantile = pd.concat(items)
return _quantile
def query(self, expr, inplace=False, **kwargs):
"""Queries the Dataframe with a boolean expression
@@ -2273,9 +2414,34 @@ class DataFrame(object):
def std(self, axis=None, skipna=None, level=None, ddof=1,
numeric_only=None, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
"""Computes standard deviation across the DataFrame.
Args:
axis (int): The axis to take the std on.
skipna (bool): True to skip NA values, false otherwise.
ddof (int): degrees of freedom
Returns:
The std of the DataFrame (Pandas Series)
"""
if axis == 0 or axis is None:
return self.T.std(
axis=1, skipna=skipna, level=level,
ddof=ddof, numeric_only=numeric_only)
else:
computed_stds = [_deploy_func.remote(
lambda df: df.T.std(
axis=0, skipna=skipna, level=level,
ddof=ddof,
numeric_only=numeric_only), part)
for part in self._df]
items = ray.get(computed_stds)
_stds = pd.concat(items)
return _stds
def sub(self, other, axis='columns', level=None, fill_value=None):
raise NotImplementedError(
@@ -2529,9 +2695,32 @@ class DataFrame(object):
def var(self, axis=None, skipna=None, level=None, ddof=1,
numeric_only=None, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
"""Computes variance across the DataFrame.
Args:
axis (int): The axis to take the variance on.
skipna (bool): True to skip NA values, false otherwise.
ddof (int): degrees of freedom
Returns:
The variance of the DataFrame.
"""
if axis == 0 or axis is None:
return self.T.var(axis=1, skipna=skipna, level=level, ddof=ddof,
numeric_only=numeric_only)
else:
computed_vars = [_deploy_func.remote(lambda df: df.T.var(
axis=0, skipna=skipna, level=level,
ddof=ddof,
numeric_only=numeric_only),
part)
for part in self._df]
items = ray.get(computed_vars)
_var = pd.concat(items)
return _var
def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
errors='raise', try_cast=False, raise_on_error=None):
+54 -30
View File
@@ -202,6 +202,15 @@ def test_int_dataframe():
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
@@ -316,6 +325,15 @@ def test_float_dataframe():
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
@@ -432,6 +450,15 @@ def test_mixed_dtype_dataframe():
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
@@ -549,6 +576,15 @@ def test_nan_dataframe():
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
@@ -860,11 +896,9 @@ def test_cumsum(ray_df, pandas_df):
assert(ray_df_equals_pandas(ray_df.cumsum(), pandas_df.cumsum()))
def test_describe():
ray_df = create_test_dataframe()
with pytest.raises(NotImplementedError):
ray_df.describe()
@pytest.fixture
def test_describe(ray_df, pandas_df):
assert(ray_df.describe().equals(pandas_df.describe()))
def test_diff():
@@ -1743,18 +1777,14 @@ def test_max(ray_df, pandas_df):
assert(ray_df_equals_pandas(ray_df.max(), pandas_df.max()))
def test_mean():
ray_df = create_test_dataframe()
with pytest.raises(NotImplementedError):
ray_df.mean()
@pytest.fixture
def test_mean(ray_df, pandas_df):
assert(ray_df.mean().equals(pandas_df.mean()))
def test_median():
ray_df = create_test_dataframe()
with pytest.raises(NotImplementedError):
ray_df.median()
@pytest.fixture
def test_median(ray_df, pandas_df):
assert(ray_df.median().equals(pandas_df.median()))
def test_melt():
@@ -1915,11 +1945,9 @@ def test_product():
ray_df.product()
def test_quantile():
ray_df = create_test_dataframe()
with pytest.raises(NotImplementedError):
ray_df.quantile()
@pytest.fixture
def test_quantile(ray_df, pandas_df, q):
assert(ray_df.quantile(q).equals(pandas_df.quantile(q)))
@pytest.fixture
@@ -2424,11 +2452,9 @@ def test_stack():
ray_df.stack()
def test_std():
ray_df = create_test_dataframe()
with pytest.raises(NotImplementedError):
ray_df.std()
@pytest.fixture
def test_std(ray_df, pandas_df):
assert(ray_df.std().equals(pandas_df.std()))
def test_sub():
@@ -2688,11 +2714,9 @@ def test_update():
ray_df.update(None)
def test_var():
ray_df = create_test_dataframe()
with pytest.raises(NotImplementedError):
ray_df.var()
@pytest.fixture
def test_var(ray_df, pandas_df):
assert(ray_df.var().equals(pandas_df.var()))
def test_where():