[DataFrame] Implementing API correct groupby with aggregation methods (#1914)

This commit is contained in:
Devin Petersohn
2018-04-21 17:28:16 -07:00
committed by Robert Nishihara
parent 8264e64b18
commit 8f59546ef2
4 changed files with 607 additions and 157 deletions
+171 -27
View File
@@ -290,6 +290,42 @@ def test_int_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_float_dataframe():
@@ -339,7 +375,8 @@ def test_float_dataframe():
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
# TODO Clear floating point error.
# test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
@@ -414,6 +451,43 @@ def test_float_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
# TODO: NaNs never compare equal to each other; fix the comparison
# test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_mixed_dtype_dataframe():
pandas_df = pd.DataFrame({
@@ -465,7 +539,8 @@ def test_mixed_dtype_dataframe():
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
# TODO Clear floating point error.
# test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
@@ -549,6 +624,30 @@ def test_mixed_dtype_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum()]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_nan_dataframe():
pandas_df = pd.DataFrame({
@@ -670,6 +769,43 @@ def test_nan_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
# TODO: NaNs never compare equal to each other; fix the comparison
# test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_add():
ray_df = create_test_dataframe()
@@ -678,18 +814,24 @@ def test_add():
ray_df.add(None)
def test_agg():
    """Expect ``agg(None)`` on the test frame to raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.agg(None)
# NOTE(review): decorated as a fixture although callers invoke it directly as
# a helper -- presumably to keep pytest from collecting it as a test; confirm.
@pytest.fixture
def test_agg(ray_df, pandas_df, func, axis):
    """Assert that ``agg`` gives the same result on the Ray and pandas frames.

    Args:
        ray_df: Ray DataFrame under test.
        pandas_df: pandas DataFrame used as the reference.
        func: Aggregation passed through to ``agg`` unchanged.
        axis: Axis passed positionally to ``agg``.
    """
    # The Ray call runs first so that any exception it raises surfaces
    # before pandas is touched.
    actual = ray_df.agg(func, axis)
    expected = pandas_df.agg(func, axis)

    if not isinstance(actual, rdf.DataFrame):
        # Non-DataFrame results are compared through their own ``equals``.
        assert actual.equals(expected)
        return

    assert ray_df_equals_pandas(actual, expected)
def test_aggregate():
    """Expect ``aggregate(None)`` on the test frame to raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.aggregate(None)
# NOTE(review): decorated as a fixture although callers invoke it directly as
# a helper -- presumably to keep pytest from collecting it as a test; confirm.
@pytest.fixture
def test_aggregate(ray_df, pandas_df, func, axis):
    """Assert that ``aggregate`` matches between the Ray and pandas frames.

    Args:
        ray_df: Ray DataFrame under test.
        pandas_df: pandas DataFrame used as the reference.
        func: Aggregation passed through to ``aggregate`` unchanged.
        axis: Axis passed positionally to ``aggregate``.
    """
    # The Ray call runs first so that any exception it raises surfaces
    # before pandas is touched.
    actual = ray_df.aggregate(func, axis)
    expected = pandas_df.aggregate(func, axis)

    if not isinstance(actual, rdf.DataFrame):
        # Non-DataFrame results are compared through their own ``equals``.
        assert actual.equals(expected)
        return

    assert ray_df_equals_pandas(actual, expected)
def test_align():
@@ -718,11 +860,14 @@ def test_append():
ray_df.append(None)
def test_apply():
    """Expect ``apply(None)`` on the test frame to raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.apply(None)
# NOTE(review): decorated as a fixture although callers invoke it directly as
# a helper -- presumably to keep pytest from collecting it as a test; confirm.
@pytest.fixture
def test_apply(ray_df, pandas_df, func, axis):
    """Assert that ``apply`` gives the same result on the Ray and pandas frames.

    Args:
        ray_df: Ray DataFrame under test.
        pandas_df: pandas DataFrame used as the reference.
        func: Function (or function spec) passed through to ``apply``.
        axis: Axis passed positionally to ``apply``.
    """
    # The Ray call runs first so that any exception it raises surfaces
    # before pandas is touched.
    actual = ray_df.apply(func, axis)
    expected = pandas_df.apply(func, axis)

    if not isinstance(actual, rdf.DataFrame):
        # Non-DataFrame results are compared through their own ``equals``.
        assert actual.equals(expected)
        return

    assert ray_df_equals_pandas(actual, expected)
def test_as_blocks():
@@ -2681,11 +2826,12 @@ def test_to_xarray():
ray_df.to_xarray()
def test_transform():
    """Expect ``transform(None)`` on the test frame to raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.transform(None)
# NOTE(review): decorated as a fixture although callers invoke it directly as
# a helper -- presumably to keep pytest from collecting it as a test; confirm.
@pytest.fixture
def test_transform(ray_df, pandas_df):
    """Assert that ``transform`` matches between the Ray and pandas frames.

    The ``isna`` transformation is exercised twice: once as a callable and
    once by its string name.

    Args:
        ray_df: Ray DataFrame under test.
        pandas_df: pandas DataFrame used as the reference.
    """
    # The comparison result must be asserted; calling ray_df_equals_pandas
    # on its own discards the outcome and the check could never fail.
    assert ray_df_equals_pandas(ray_df.transform(lambda df: df.isna()),
                                pandas_df.transform(lambda df: df.isna()))
    assert ray_df_equals_pandas(ray_df.transform('isna'),
                                pandas_df.transform('isna'))
def test_truediv():
@@ -2865,11 +3011,9 @@ def test___round__():
ray_df.__round__()
def test___array__():
    """Expect ``__array__()`` on the test frame to raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__array__()
# NOTE(review): decorated as a fixture although callers invoke it directly as
# a helper -- presumably to keep pytest from collecting it as a test; confirm.
@pytest.fixture
def test___array__(ray_df, pandas_df):
    """Assert that ``__array__`` yields equal arrays for both frames.

    Args:
        ray_df: Ray DataFrame under test.
        pandas_df: pandas DataFrame used as the reference.
    """
    actual = ray_df.__array__()
    expected = pandas_df.__array__()
    assert np.array_equal(actual, expected)
def test___array_wrap__():