Files
ray/python/ray/dataframe/test/test_dataframe.py
T
2018-04-23 17:09:57 -07:00

3077 lines
84 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
import ray.dataframe as rdf
from ray.dataframe.utils import (
from_pandas,
to_pandas)
from pandas.tests.frame.common import TestData
@pytest.fixture
def ray_df_equals_pandas(ray_df, pandas_df):
    """Return whether ``ray_df`` equals ``pandas_df`` once both are index-sorted.

    ``ray_df`` is first converted with ``to_pandas``; both sides then go
    through ``sort_index()`` before ``equals`` is applied.
    """
    actual = to_pandas(ray_df).sort_index()
    expected = pandas_df.sort_index()
    return actual.equals(expected)
@pytest.fixture
def ray_series_equals_pandas(ray_df, pandas_df):
    """Return whether the two arguments are equal after ``sort_index()``.

    Unlike ``ray_df_equals_pandas``, no ``to_pandas`` conversion is applied
    to the first argument.
    """
    actual = ray_df.sort_index()
    expected = pandas_df.sort_index()
    return actual.equals(expected)
@pytest.fixture
def ray_df_equals(ray_df1, ray_df2):
    """Return whether two ray frames are equal after conversion and sorting.

    Both arguments are converted with ``to_pandas`` and index-sorted before
    being compared with ``equals``.
    """
    left = to_pandas(ray_df1).sort_index()
    right = to_pandas(ray_df2).sort_index()
    return left.equals(right)
@pytest.fixture
def test_roundtrip(ray_df, pandas_df):
    """Assert that ``ray_df`` compares equal to ``pandas_df``."""
    frames_match = ray_df_equals_pandas(ray_df, pandas_df)
    assert frames_match
@pytest.fixture
def test_index(ray_df, pandas_df):
    """Check that ``ray_df.index`` agrees with ``pandas_df.index``.

    The indexes are compared as-is first, then again after a stringified
    index has been assigned to a copy of each frame.
    """
    assert ray_df.index.equals(pandas_df.index)

    ray_df_cp = ray_df.copy()
    pandas_df_cp = pandas_df.copy()
    ray_df_cp.index = [str(i) for i in ray_df_cp.index]
    pandas_df_cp.index = [str(i) for i in pandas_df_cp.index]
    # Sort both sides. Sorting only the ray index makes the check fail for
    # any frame whose stringified labels are not already in sorted order.
    assert ray_df_cp.index.sort_values().equals(
        pandas_df_cp.index.sort_values())
@pytest.fixture
def test_size(ray_df, pandas_df):
    """Assert that both frames report the same ``size``."""
    actual = ray_df.size
    expected = pandas_df.size
    assert actual == expected
@pytest.fixture
def test_ndim(ray_df, pandas_df):
    """Assert that both frames report the same ``ndim``."""
    actual = ray_df.ndim
    expected = pandas_df.ndim
    assert actual == expected
@pytest.fixture
def test_ftypes(ray_df, pandas_df):
    """Assert that ``ftypes`` of the ray frame equals that of the pandas frame."""
    actual = ray_df.ftypes
    expected = pandas_df.ftypes
    assert actual.equals(expected)
@pytest.fixture
def test_dtypes(ray_df, pandas_df):
    """Assert that ``dtypes`` of the ray frame equals that of the pandas frame."""
    actual = ray_df.dtypes
    expected = pandas_df.dtypes
    assert actual.equals(expected)
@pytest.fixture
def test_values(ray_df, pandas_df):
    """Compare ``.values`` of both frames with ``np.testing.assert_equal``."""
    actual = ray_df.values
    expected = pandas_df.values
    np.testing.assert_equal(actual, expected)
@pytest.fixture
def test_axes(ray_df, pandas_df):
    """Assert that each pair of axes from the two frames is array-equal."""
    paired_axes = zip(ray_df.axes, pandas_df.axes)
    # all() stops at the first mismatching pair, as a per-pair assert would.
    assert all(np.array_equal(ray_axis, pd_axis)
               for ray_axis, pd_axis in paired_axes)
@pytest.fixture
def test_shape(ray_df, pandas_df):
    """Assert that both frames report the same ``shape``."""
    actual = ray_df.shape
    expected = pandas_df.shape
    assert actual == expected
@pytest.fixture
def test_add_prefix(ray_df, pandas_df):
    """Assert that ``add_prefix("TEST")`` yields identical column labels."""
    prefix = "TEST"
    prefixed_ray = ray_df.add_prefix(prefix)
    prefixed_pandas = pandas_df.add_prefix(prefix)
    actual_columns = prefixed_ray.columns
    assert actual_columns.equals(prefixed_pandas.columns)
@pytest.fixture
def test_add_suffix(ray_df, pandas_df):
    """Assert that ``add_suffix("TEST")`` yields identical column labels."""
    suffix = "TEST"
    suffixed_ray = ray_df.add_suffix(suffix)
    suffixed_pandas = pandas_df.add_suffix(suffix)
    actual_columns = suffixed_ray.columns
    assert actual_columns.equals(suffixed_pandas.columns)
@pytest.fixture
def test_applymap(ray_df, pandas_df, testfunc):
    """Assert that ``applymap(testfunc)`` gives matching results on both frames."""
    mapped_ray = ray_df.applymap(testfunc)
    mapped_pandas = pandas_df.applymap(testfunc)
    frames_match = ray_df_equals_pandas(mapped_ray, mapped_pandas)
    assert frames_match
@pytest.fixture
def test_copy(ray_df):
    """Assert that ``copy()`` returns a new object with equal block partitions."""
    duplicate = ray_df.copy()
    # The copy must be a distinct object ...
    assert duplicate is not ray_df
    # ... whose ``_block_partitions`` array compares equal to the source's.
    same_blocks = np.array_equal(duplicate._block_partitions,
                                 ray_df._block_partitions)
    assert same_blocks
@pytest.fixture
def test_sum(ray_df, pandas_df):
    """Assert that index-sorted ``sum()`` results are equal."""
    actual = ray_df.sum().sort_index()
    expected = pandas_df.sum().sort_index()
    assert actual.equals(expected)
@pytest.fixture
def test_abs(ray_df, pandas_df):
    """Assert that ``abs()`` gives matching results on both frames."""
    actual = ray_df.abs()
    expected = pandas_df.abs()
    assert ray_df_equals_pandas(actual, expected)
@pytest.fixture
def test_keys(ray_df, pandas_df):
    """Assert that ``keys()`` of both frames are equal."""
    actual = ray_df.keys()
    expected = pandas_df.keys()
    assert actual.equals(expected)
@pytest.fixture
def test_transpose(ray_df, pandas_df):
    """Assert that ``.T`` and ``.transpose()`` both match pandas."""
    # Attribute form.
    via_attribute = ray_df_equals_pandas(ray_df.T, pandas_df.T)
    assert via_attribute
    # Method form.
    via_method = ray_df_equals_pandas(ray_df.transpose(),
                                      pandas_df.transpose())
    assert via_method
@pytest.fixture
def test_get(ray_df, pandas_df, key):
    """Assert that ``get(key)`` matches pandas, with and without a default."""
    plain_ray = ray_df.get(key)
    plain_pandas = pandas_df.get(key)
    assert plain_ray.equals(plain_pandas)

    defaulted_ray = ray_df.get(key, default='default')
    defaulted_pandas = pandas_df.get(key, default='default')
    assert defaulted_ray.equals(defaulted_pandas)
@pytest.fixture
def test_get_dtype_counts(ray_df, pandas_df):
    """Assert that ``get_dtype_counts()`` results are equal."""
    actual = ray_df.get_dtype_counts()
    expected = pandas_df.get_dtype_counts()
    assert actual.equals(expected)
@pytest.fixture
def test_get_ftype_counts(ray_df, pandas_df):
    """Assert that ``get_ftype_counts()`` results are equal."""
    actual = ray_df.get_ftype_counts()
    expected = pandas_df.get_ftype_counts()
    assert actual.equals(expected)
@pytest.fixture
def create_test_dataframe():
    """Build the small integer frame used by the per-method tests.

    Returns the result of ``from_pandas`` applied to a 4-row pandas frame
    with columns ``col1`` .. ``col5``, passing ``2`` as the second argument.
    """
    column_data = {
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, 6, 7],
        'col3': [8, 9, 10, 11],
        'col4': [12, 13, 14, 15],
        'col5': [0, 0, 0, 0],
    }
    pandas_frame = pd.DataFrame(column_data)
    return from_pandas(pandas_frame, 2)
def test_int_dataframe():
# Driver test: builds a 4-row integer pandas frame with columns col1..col5,
# converts it with from_pandas(pandas_df, 2), and runs the helper checks of
# this module against the pair.
# NOTE(review): many helpers called below are decorated with @pytest.fixture
# in this file; pytest 4+ rejects direct calls to fixture functions, so
# confirm the pinned pytest version.
pandas_df = pd.DataFrame({'col1': [0, 1, 2, 3],
'col2': [4, 5, 6, 7],
'col3': [8, 9, 10, 11],
'col4': [12, 13, 14, 15],
'col5': [0, 0, 0, 0]})
ray_df = from_pandas(pandas_df, 2)
# Element-wise callables fed to test_applymap.
testfuncs = [lambda x: x + 1,
lambda x: str(x),
lambda x: x * x,
lambda x: x,
lambda x: False]
# Expressions handed to test_query.
query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2',
'(col2 > col1) and (col1 < col3)']
# Column labels used for get / set_index / __contains__ / insert checks
# (col5 is not included).
keys = ['col1',
'col2',
'col3',
'col4']
# Structural attribute checks.
test_roundtrip(ray_df, pandas_df)
test_index(ray_df, pandas_df)
test_size(ray_df, pandas_df)
test_ndim(ray_df, pandas_df)
test_ftypes(ray_df, pandas_df)
test_dtypes(ray_df, pandas_df)
test_values(ray_df, pandas_df)
test_axes(ray_df, pandas_df)
test_shape(ray_df, pandas_df)
test_add_prefix(ray_df, pandas_df)
test_add_suffix(ray_df, pandas_df)
for testfunc in testfuncs:
test_applymap(ray_df, pandas_df, testfunc)
test_copy(ray_df)
# Reductions, statistics and dunder methods.
test_sum(ray_df, pandas_df)
test_abs(ray_df, pandas_df)
test_keys(ray_df, pandas_df)
test_transpose(ray_df, pandas_df)
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
test___neg__(ray_df, pandas_df)
test___iter__(ray_df, pandas_df)
test___abs__(ray_df, pandas_df)
test___delitem__(ray_df, pandas_df)
test___copy__(ray_df, pandas_df)
test___deepcopy__(ray_df, pandas_df)
test_bool(ray_df, pandas_df)
test_count(ray_df, pandas_df)
test_head(ray_df, pandas_df, 2)
test_head(ray_df, pandas_df)
test_tail(ray_df, pandas_df)
test_idxmax(ray_df, pandas_df)
test_idxmin(ray_df, pandas_df)
test_pop(ray_df, pandas_df)
test___len__(ray_df, pandas_df)
test_first_valid_index(ray_df, pandas_df)
test_last_valid_index(ray_df, pandas_df)
for key in keys:
test_get(ray_df, pandas_df, key)
test_get_dtype_counts(ray_df, pandas_df)
test_get_ftype_counts(ray_df, pandas_df)
# Iteration protocols.
test_iterrows(ray_df, pandas_df)
test_items(ray_df, pandas_df)
test_iteritems(ray_df, pandas_df)
test_itertuples(ray_df, pandas_df)
test_max(ray_df, pandas_df)
test_min(ray_df, pandas_df)
test_notna(ray_df, pandas_df)
test_notnull(ray_df, pandas_df)
test_cummax(ray_df, pandas_df)
test_cummin(ray_df, pandas_df)
test_cumprod(ray_df, pandas_df)
test_cumsum(ray_df, pandas_df)
# loc / iloc checks are currently disabled.
# test_loc(ray_df, pandas_df)
# test_iloc(ray_df, pandas_df)
# Four labels for the row axis (the frame has 4 rows); a fifth label is
# appended before the column-axis calls (the frame has 5 columns).
labels = ['a', 'b', 'c', 'd']
test_set_axis(ray_df, pandas_df, labels, 0)
test_set_axis(ray_df, pandas_df, labels, 'rows')
labels.append('e')
test_set_axis(ray_df, pandas_df, labels, 1)
test_set_axis(ray_df, pandas_df, labels, 'columns')
# NOTE(review): unlike the float/mixed/nan drivers, set_index is not also
# exercised with inplace=True here — confirm whether that is intentional.
for key in keys:
test_set_index(ray_df, pandas_df, key)
test_reset_index(ray_df, pandas_df)
test_reset_index(ray_df, pandas_df, inplace=True)
for key in keys:
test___contains__(ray_df, key, True)
test___contains__(ray_df, "Not Exists", False)
for key in keys:
test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
test___array__(ray_df, pandas_df)
# apply / aggregate / agg: string, callable and list-of-string forms.
# List forms are expected to raise NotImplementedError when the last
# argument (axis) is 1.
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
# A list mixing a string with a callable is expected to raise
# NotImplementedError for both axis values.
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_float_dataframe():
# Driver test: same sequence of helper checks as the integer driver, run
# against a 4-row float frame converted with from_pandas(pandas_df, 3).
pandas_df = pd.DataFrame({'col1': [0.0, 1.0, 2.0, 3.0],
'col2': [4.0, 5.0, 6.0, 7.0],
'col3': [8.0, 9.0, 10.0, 11.0],
'col4': [12.0, 13.0, 14.0, 15.0],
'col5': [0.0, 0.0, 0.0, 0.0]})
ray_df = from_pandas(pandas_df, 3)
# Element-wise callables fed to test_applymap.
testfuncs = [lambda x: x + 1,
lambda x: str(x),
lambda x: x * x,
lambda x: x,
lambda x: False]
# Expressions handed to test_query.
query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2',
'(col2 > col1) and (col1 < col3)']
# Column labels used for get / set_index / __contains__ / insert checks.
keys = ['col1',
'col2',
'col3',
'col4']
# Structural attribute checks.
test_roundtrip(ray_df, pandas_df)
test_index(ray_df, pandas_df)
test_size(ray_df, pandas_df)
test_ndim(ray_df, pandas_df)
test_ftypes(ray_df, pandas_df)
test_dtypes(ray_df, pandas_df)
test_values(ray_df, pandas_df)
test_axes(ray_df, pandas_df)
test_shape(ray_df, pandas_df)
test_add_prefix(ray_df, pandas_df)
test_add_suffix(ray_df, pandas_df)
for testfunc in testfuncs:
test_applymap(ray_df, pandas_df, testfunc)
test_copy(ray_df)
# Reductions, statistics and dunder methods.
test_sum(ray_df, pandas_df)
test_abs(ray_df, pandas_df)
test_keys(ray_df, pandas_df)
test_transpose(ray_df, pandas_df)
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
# The variance check is disabled for float data pending a fix for a
# floating point discrepancy.
# TODO Clear floating point error.
# test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
test___neg__(ray_df, pandas_df)
test___iter__(ray_df, pandas_df)
test___abs__(ray_df, pandas_df)
test___delitem__(ray_df, pandas_df)
test___copy__(ray_df, pandas_df)
test___deepcopy__(ray_df, pandas_df)
test_bool(ray_df, pandas_df)
test_count(ray_df, pandas_df)
test_head(ray_df, pandas_df, 3)
test_head(ray_df, pandas_df)
test_tail(ray_df, pandas_df)
test_idxmax(ray_df, pandas_df)
test_idxmin(ray_df, pandas_df)
test_pop(ray_df, pandas_df)
test_max(ray_df, pandas_df)
test_min(ray_df, pandas_df)
test_notna(ray_df, pandas_df)
test_notnull(ray_df, pandas_df)
test_cummax(ray_df, pandas_df)
test_cummin(ray_df, pandas_df)
test_cumprod(ray_df, pandas_df)
test_cumsum(ray_df, pandas_df)
test___len__(ray_df, pandas_df)
test_first_valid_index(ray_df, pandas_df)
test_last_valid_index(ray_df, pandas_df)
for key in keys:
test_get(ray_df, pandas_df, key)
test_get_dtype_counts(ray_df, pandas_df)
test_get_ftype_counts(ray_df, pandas_df)
# Iteration protocols.
test_iterrows(ray_df, pandas_df)
test_items(ray_df, pandas_df)
test_iteritems(ray_df, pandas_df)
test_itertuples(ray_df, pandas_df)
# loc / iloc checks are currently disabled.
# test_loc(ray_df, pandas_df)
# test_iloc(ray_df, pandas_df)
# Four labels for the row axis (4 rows); a fifth is appended before the
# column-axis calls (5 columns).
labels = ['a', 'b', 'c', 'd']
test_set_axis(ray_df, pandas_df, labels, 0)
test_set_axis(ray_df, pandas_df, labels, 'rows')
labels.append('e')
test_set_axis(ray_df, pandas_df, labels, 1)
test_set_axis(ray_df, pandas_df, labels, 'columns')
for key in keys:
test_set_index(ray_df, pandas_df, key)
test_set_index(ray_df, pandas_df, key, inplace=True)
test_reset_index(ray_df, pandas_df)
test_reset_index(ray_df, pandas_df, inplace=True)
for key in keys:
test___contains__(ray_df, key, True)
test___contains__(ray_df, "Not Exists", False)
for key in keys:
test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
# The __array__ check is disabled because NaN never compares equal to NaN.
# TODO Nans are always not equal to each other, fix it
# test___array__(ray_df, pandas_df)
# apply / aggregate / agg: string, callable and list-of-string forms.
# List forms are expected to raise NotImplementedError when the last
# argument (axis) is 1.
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
# A list mixing a string with a callable is expected to raise
# NotImplementedError for both axis values.
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_mixed_dtype_dataframe():
# Driver test: runs the helper checks against a 4-row frame mixing integer
# (col1, col2), float (col3) and string (col4) columns, converted with
# from_pandas(pandas_df, 2). Several numeric-only operations are expected
# to raise TypeError here.
pandas_df = pd.DataFrame({
'col1': [1, 2, 3, 4],
'col2': [4, 5, 6, 7],
'col3': [8.0, 9.4, 10.1, 11.3],
'col4': ['a', 'b', 'c', 'd']})
ray_df = from_pandas(pandas_df, 2)
# Element-wise callables fed to test_applymap; x + x is used instead of
# x + 1 / x * x as in the integer and float drivers.
testfuncs = [lambda x: x + x,
lambda x: str(x),
lambda x: x,
lambda x: False]
# Expressions handed to test_query ('col3 > col4' is omitted here).
query_funcs = ['col1 < col2', 'col1 == col2',
'(col2 > col1) and (col1 < col3)']
# Column labels used for get / set_index / __contains__ / insert checks.
keys = ['col1',
'col2',
'col3',
'col4']
# Structural attribute checks.
test_roundtrip(ray_df, pandas_df)
test_index(ray_df, pandas_df)
test_size(ray_df, pandas_df)
test_ndim(ray_df, pandas_df)
test_ftypes(ray_df, pandas_df)
test_dtypes(ray_df, pandas_df)
test_values(ray_df, pandas_df)
test_axes(ray_df, pandas_df)
test_shape(ray_df, pandas_df)
test_add_prefix(ray_df, pandas_df)
test_add_suffix(ray_df, pandas_df)
for testfunc in testfuncs:
test_applymap(ray_df, pandas_df, testfunc)
test_copy(ray_df)
test_sum(ray_df, pandas_df)
# NOTE(review): if both calls below sit inside the same pytest.raises
# block, the second never runs once the first raises — confirm nesting.
with pytest.raises(TypeError):
test_abs(ray_df, pandas_df)
test___abs__(ray_df, pandas_df)
test_keys(ray_df, pandas_df)
test_transpose(ray_df, pandas_df)
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
# The variance check is disabled pending a fix for a floating point
# discrepancy.
# TODO Clear floating point error.
# test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
with pytest.raises(TypeError):
test___neg__(ray_df, pandas_df)
test___iter__(ray_df, pandas_df)
test___delitem__(ray_df, pandas_df)
test___copy__(ray_df, pandas_df)
test___deepcopy__(ray_df, pandas_df)
test_bool(ray_df, pandas_df)
test_count(ray_df, pandas_df)
test_head(ray_df, pandas_df, 2)
test_head(ray_df, pandas_df)
test_tail(ray_df, pandas_df)
with pytest.raises(TypeError):
test_idxmax(ray_df, pandas_df)
with pytest.raises(TypeError):
test_idxmin(ray_df, pandas_df)
test_pop(ray_df, pandas_df)
test_max(ray_df, pandas_df)
test_min(ray_df, pandas_df)
test_notna(ray_df, pandas_df)
test_notnull(ray_df, pandas_df)
# Cumulative checks are disabled for mixed dtypes; see the TODO below.
# TODO Fix pandas so that the behavior is correct
# We discovered a bug where argmax does not always give the same result
# depending on what your other dtypes are.
# test_cummax(ray_df, pandas_df)
# test_cummin(ray_df, pandas_df)
# test_cumprod(ray_df, pandas_df)
# test_cumsum(ray_df, pandas_df)
test___len__(ray_df, pandas_df)
test_first_valid_index(ray_df, pandas_df)
test_last_valid_index(ray_df, pandas_df)
for key in keys:
test_get(ray_df, pandas_df, key)
test_get_dtype_counts(ray_df, pandas_df)
test_get_ftype_counts(ray_df, pandas_df)
# Iteration protocols.
test_iterrows(ray_df, pandas_df)
test_items(ray_df, pandas_df)
test_iteritems(ray_df, pandas_df)
test_itertuples(ray_df, pandas_df)
# loc / iloc checks are currently disabled.
# test_loc(ray_df, pandas_df)
# test_iloc(ray_df, pandas_df)
# The frame is 4x4, so the same four labels serve both axes.
labels = ['a', 'b', 'c', 'd']
test_set_axis(ray_df, pandas_df, labels, 0)
test_set_axis(ray_df, pandas_df, labels, 'rows')
test_set_axis(ray_df, pandas_df, labels, 1)
test_set_axis(ray_df, pandas_df, labels, 'columns')
for key in keys:
test_set_index(ray_df, pandas_df, key)
test_set_index(ray_df, pandas_df, key, inplace=True)
test_reset_index(ray_df, pandas_df)
test_reset_index(ray_df, pandas_df, inplace=True)
for key in keys:
test___contains__(ray_df, key, True)
test___contains__(ray_df, "Not Exists", False)
for key in keys:
test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
test___array__(ray_df, pandas_df)
# Only the string and callable forms are exercised, and only with 0 as the
# last argument (axis).
apply_agg_functions = ['sum', lambda df: df.sum()]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
# A list mixing a string with a callable is expected to raise
# NotImplementedError for both axis values.
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_nan_dataframe():
# Driver test: runs the helper checks against a 4x4 frame in which every
# column contains exactly one np.nan, converted with
# from_pandas(pandas_df, 2).
pandas_df = pd.DataFrame({
'col1': [1, 2, 3, np.nan],
'col2': [4, 5, np.nan, 7],
'col3': [8, np.nan, 10, 11],
'col4': [np.nan, 13, 14, 15]})
ray_df = from_pandas(pandas_df, 2)
# Element-wise callables fed to test_applymap.
testfuncs = [lambda x: x + x,
lambda x: str(x),
lambda x: x,
lambda x: False]
# Expressions handed to test_query.
query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2',
'(col2 > col1) and (col1 < col3)']
# Column labels used for get / set_index / __contains__ / insert checks.
keys = ['col1',
'col2',
'col3',
'col4']
# Structural attribute checks.
test_roundtrip(ray_df, pandas_df)
test_index(ray_df, pandas_df)
test_size(ray_df, pandas_df)
test_ndim(ray_df, pandas_df)
test_ftypes(ray_df, pandas_df)
test_dtypes(ray_df, pandas_df)
test_values(ray_df, pandas_df)
test_axes(ray_df, pandas_df)
test_shape(ray_df, pandas_df)
test_add_prefix(ray_df, pandas_df)
test_add_suffix(ray_df, pandas_df)
for testfunc in testfuncs:
test_applymap(ray_df, pandas_df, testfunc)
test_copy(ray_df)
# Reductions, statistics and dunder methods.
test_sum(ray_df, pandas_df)
test_abs(ray_df, pandas_df)
test_keys(ray_df, pandas_df)
test_transpose(ray_df, pandas_df)
test_round(ray_df, pandas_df)
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
test_quantile(ray_df, pandas_df, .5)
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
test___neg__(ray_df, pandas_df)
test___iter__(ray_df, pandas_df)
test___abs__(ray_df, pandas_df)
test___delitem__(ray_df, pandas_df)
test___copy__(ray_df, pandas_df)
test___deepcopy__(ray_df, pandas_df)
test_bool(ray_df, pandas_df)
test_count(ray_df, pandas_df)
test_head(ray_df, pandas_df, 2)
test_head(ray_df, pandas_df)
test_tail(ray_df, pandas_df)
test_idxmax(ray_df, pandas_df)
test_idxmin(ray_df, pandas_df)
test_pop(ray_df, pandas_df)
test_max(ray_df, pandas_df)
test_min(ray_df, pandas_df)
test_notna(ray_df, pandas_df)
test_notnull(ray_df, pandas_df)
test_cummax(ray_df, pandas_df)
test_cummin(ray_df, pandas_df)
test_cumprod(ray_df, pandas_df)
test_cumsum(ray_df, pandas_df)
test___len__(ray_df, pandas_df)
test_first_valid_index(ray_df, pandas_df)
test_last_valid_index(ray_df, pandas_df)
for key in keys:
test_get(ray_df, pandas_df, key)
test_get_dtype_counts(ray_df, pandas_df)
test_get_ftype_counts(ray_df, pandas_df)
# Iteration protocols.
test_iterrows(ray_df, pandas_df)
test_items(ray_df, pandas_df)
test_iteritems(ray_df, pandas_df)
test_itertuples(ray_df, pandas_df)
# loc / iloc checks are currently disabled.
# test_loc(ray_df, pandas_df)
# test_iloc(ray_df, pandas_df)
# The frame is 4x4, so the same four labels serve both axes.
labels = ['a', 'b', 'c', 'd']
test_set_axis(ray_df, pandas_df, labels, 0)
test_set_axis(ray_df, pandas_df, labels, 'rows')
test_set_axis(ray_df, pandas_df, labels, 1)
test_set_axis(ray_df, pandas_df, labels, 'columns')
for key in keys:
test_set_index(ray_df, pandas_df, key)
test_set_index(ray_df, pandas_df, key, inplace=True)
test_reset_index(ray_df, pandas_df)
test_reset_index(ray_df, pandas_df, inplace=True)
for key in keys:
test___contains__(ray_df, key, True)
test___contains__(ray_df, "Not Exists", False)
for key in keys:
test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
# The __array__ check is disabled because NaN never compares equal to NaN.
# TODO Nans are always not equal to each other, fix it
# test___array__(ray_df, pandas_df)
# apply / aggregate / agg: string, callable and list-of-string forms.
# List forms are expected to raise NotImplementedError when the last
# argument (axis) is 1.
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
# A list mixing a string with a callable is expected to raise
# NotImplementedError for both axis values.
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_add():
    """``add(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.add(None)
@pytest.fixture
def test_agg(ray_df, pandas_df, func, axis):
    """Assert that ``agg(func, axis)`` gives matching results on both frames.

    A ray ``DataFrame`` result is compared through ``ray_df_equals_pandas``;
    any other result type is compared with its own ``equals`` method.
    """
    actual = ray_df.agg(func, axis)
    expected = pandas_df.agg(func, axis)
    if not isinstance(actual, rdf.DataFrame):
        assert actual.equals(expected)
        return
    assert ray_df_equals_pandas(actual, expected)
@pytest.fixture
def test_aggregate(ray_df, pandas_df, func, axis):
    """Assert that ``aggregate(func, axis)`` gives matching results.

    A ray ``DataFrame`` result is compared through ``ray_df_equals_pandas``;
    any other result type is compared with its own ``equals`` method.
    """
    actual = ray_df.aggregate(func, axis)
    expected = pandas_df.aggregate(func, axis)
    if not isinstance(actual, rdf.DataFrame):
        assert actual.equals(expected)
        return
    assert ray_df_equals_pandas(actual, expected)
def test_align():
    """``align(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.align(None)
@pytest.fixture
def test_all(ray_df, pd_df):
    """Assert that ``all()`` matches pandas for the default axis and axis=1."""
    # Default axis.
    expected_default = pd_df.all()
    assert expected_default.equals(ray_df.all())
    # Explicit axis=1.
    expected_rowwise = pd_df.all(axis=1)
    assert expected_rowwise.equals(ray_df.all(axis=1))
@pytest.fixture
def test_any(ray_df, pd_df):
    """Assert that ``any()`` matches pandas for the default axis and axis=1."""
    # Default axis.
    expected_default = pd_df.any()
    assert expected_default.equals(ray_df.any())
    # Explicit axis=1.
    expected_rowwise = pd_df.any(axis=1)
    assert expected_rowwise.equals(ray_df.any(axis=1))
def test_append():
    """Check ``append`` against pandas and its ``verify_integrity`` failure."""
    base_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
                 "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
    extra_data = {"col5": [0], "col6": [1]}

    ray_base = rdf.DataFrame(base_data)
    pandas_base = pd.DataFrame(base_data)
    ray_extra = rdf.DataFrame(extra_data)
    pandas_extra = pd.DataFrame(extra_data)

    appended_ray = ray_base.append(ray_extra)
    appended_pandas = pandas_base.append(pandas_extra)
    assert ray_df_equals_pandas(appended_ray, appended_pandas)

    # With verify_integrity=True the append is expected to raise ValueError.
    with pytest.raises(ValueError):
        ray_base.append(ray_extra, verify_integrity=True)
@pytest.fixture
def test_apply(ray_df, pandas_df, func, axis):
    """Assert that ``apply(func, axis)`` gives matching results.

    A ray ``DataFrame`` result is compared through ``ray_df_equals_pandas``;
    any other result type is compared with its own ``equals`` method.
    """
    actual = ray_df.apply(func, axis)
    expected = pandas_df.apply(func, axis)
    if not isinstance(actual, rdf.DataFrame):
        assert actual.equals(expected)
        return
    assert ray_df_equals_pandas(actual, expected)
def test_as_blocks():
    """``as_blocks()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.as_blocks()
def test_as_matrix():
    """``as_matrix()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.as_matrix()
def test_asfreq():
    """``asfreq(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.asfreq(None)
def test_asof():
    """``asof(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.asof(None)
def test_assign():
    """``assign()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.assign()
def test_astype():
    """``astype(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.astype(None)
def test_at_time():
    """``at_time(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.at_time(None)
def test_between_time():
    """``between_time(None, None)`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.between_time(None, None)
@pytest.fixture
def test_bfill(num_partitions=2):
    """Check that ``bfill()`` on a ray frame matches pandas.

    The first and last five entries of column ``'A'`` of
    ``TestData().tsframe`` are set to NaN before the comparison.

    Args:
        num_partitions: Forwarded as the second argument to ``from_pandas``.
    """
    test_data = TestData()
    tsframe = test_data.tsframe
    # Assign through .loc rather than chained indexing
    # (tsframe['A'][:5] = ...), which pandas documents as unreliable because
    # the write may land on a temporary copy.
    tsframe.loc[tsframe.index[:5], 'A'] = np.nan
    tsframe.loc[tsframe.index[-5:], 'A'] = np.nan

    ray_df = from_pandas(tsframe, num_partitions)
    assert ray_df_equals_pandas(
        ray_df.bfill(),
        tsframe.bfill()
    )
@pytest.fixture
def test_bool(ray_df, pd_df):
    """Check ``bool()`` on the given frames and on a single-element frame.

    ``bool()`` is expected to raise ``ValueError`` on both ``ray_df`` and
    ``pd_df``. A one-element boolean frame must return the same value
    through ray and pandas.
    """
    # One pytest.raises block per call: when both calls share a block, the
    # second is never reached once the first raises.
    with pytest.raises(ValueError):
        ray_df.bool()
    with pytest.raises(ValueError):
        pd_df.bool()

    single_bool_pd_df = pd.DataFrame([True])
    single_bool_ray_df = from_pandas(single_bool_pd_df, 1)
    assert single_bool_pd_df.bool() == single_bool_ray_df.bool()
def test_boxplot():
    """``boxplot()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.boxplot()
def test_clip():
    """``clip()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.clip()
def test_clip_lower():
    """``clip_lower(None)`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.clip_lower(None)
def test_clip_upper():
    """``clip_upper(None)`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.clip_upper(None)
def test_combine():
    """``combine(None, None)`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.combine(None, None)
def test_combine_first():
    """``combine_first(None)`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.combine_first(None)
def test_compound():
    """``compound()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.compound()
def test_consolidate():
    """``consolidate()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.consolidate()
def test_convert_objects():
    """``convert_objects()`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.convert_objects()
def test_corr():
    """``corr()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.corr()
def test_corrwith():
    """``corrwith(None)`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.corrwith(None)
@pytest.fixture
def test_count(ray_df, pd_df):
    """Assert that ``count()`` matches pandas for the default axis and axis=1."""
    # Default axis.
    actual_default = ray_df.count()
    assert actual_default.equals(pd_df.count())
    # Explicit axis=1.
    actual_rowwise = ray_df.count(axis=1)
    assert actual_rowwise.equals(pd_df.count(axis=1))
def test_cov():
    """``cov()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.cov()
@pytest.fixture
def test_cummax(ray_df, pandas_df):
    """Assert that ``cummax()`` gives matching results on both frames."""
    actual = ray_df.cummax()
    expected = pandas_df.cummax()
    assert ray_df_equals_pandas(actual, expected)
@pytest.fixture
def test_cummin(ray_df, pandas_df):
    """Assert that ``cummin()`` gives matching results on both frames."""
    actual = ray_df.cummin()
    expected = pandas_df.cummin()
    assert ray_df_equals_pandas(actual, expected)
@pytest.fixture
def test_cumprod(ray_df, pandas_df):
    """Assert that ``cumprod()`` gives matching results on both frames."""
    actual = ray_df.cumprod()
    expected = pandas_df.cumprod()
    assert ray_df_equals_pandas(actual, expected)
@pytest.fixture
def test_cumsum(ray_df, pandas_df):
    """Assert that ``cumsum()`` gives matching results on both frames."""
    actual = ray_df.cumsum()
    expected = pandas_df.cumsum()
    assert ray_df_equals_pandas(actual, expected)
@pytest.fixture
def test_describe(ray_df, pandas_df):
    """Assert that ``describe()`` output is equal for both frames."""
    actual = ray_df.describe()
    expected = pandas_df.describe()
    assert actual.equals(expected)
def test_diff():
    """``diff()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.diff()
def test_div():
    """``div(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.div(None)
def test_divide():
    """``divide(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.divide(None)
def test_dot():
    """``dot(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.dot(None)
def test_drop():
    """Exercise ``drop`` by label and axis, with ``errors`` and ``inplace``.

    Covers a simple two-column frame, the error paths (``ValueError`` for
    unknown labels unless ``errors='ignore'``), frames with non-unique
    column and index labels, and an in-place drop on random data.
    """
    # The original opened with an unused ``ray_df = create_test_dataframe()``
    # that was rebound before any read; it has been removed.
    simple = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
    ray_simple = from_pandas(simple, 2)
    assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[['B']])
    assert ray_df_equals_pandas(ray_simple.drop(["A", "B"], axis='columns'),
                                simple[[]])
    assert ray_df_equals_pandas(ray_simple.drop([0, 1, 3], axis=0),
                                simple.loc[[2], :])
    assert ray_df_equals_pandas(ray_simple.drop([0, 3], axis='index'),
                                simple.loc[[1, 2], :])

    # Unknown labels raise by default.
    pytest.raises(ValueError, ray_simple.drop, 5)
    pytest.raises(ValueError, ray_simple.drop, 'C', 1)
    pytest.raises(ValueError, ray_simple.drop, [1, 5])
    pytest.raises(ValueError, ray_simple.drop, ['A', 'C'], 1)

    # errors = 'ignore'
    assert ray_df_equals_pandas(ray_simple.drop(5, errors='ignore'), simple)
    assert ray_df_equals_pandas(ray_simple.drop([0, 5], errors='ignore'),
                                simple.loc[[1, 2, 3], :])
    assert ray_df_equals_pandas(ray_simple.drop('C', axis=1, errors='ignore'),
                                simple)
    assert ray_df_equals_pandas(ray_simple.drop(['A', 'C'], axis=1,
                                                errors='ignore'),
                                simple[['B']])

    # non-unique - wheee!
    nu_df = pd.DataFrame(pd.compat.lzip(range(3), range(-3, 1), list('abc')),
                         columns=['a', 'a', 'b'])
    ray_nu_df = from_pandas(nu_df, 3)
    assert ray_df_equals_pandas(ray_nu_df.drop('a', axis=1), nu_df[['b']])
    assert ray_df_equals_pandas(ray_nu_df.drop('b', axis='columns'),
                                nu_df['a'])
    assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df)  # GH 16398

    nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X']))
    nu_df.columns = list('abc')
    ray_nu_df = from_pandas(nu_df, 3)
    assert ray_df_equals_pandas(ray_nu_df.drop('X', axis='rows'),
                                nu_df.loc[["Y"], :])
    assert ray_df_equals_pandas(ray_nu_df.drop(['X', 'Y'], axis=0),
                                nu_df.loc[[], :])

    # inplace cache issue
    # GH 5628
    df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc'))
    ray_df = from_pandas(df, 2)
    expected = df[~(df.b > 0)]
    ray_df.drop(labels=df[df.b > 0].index, inplace=True)
    assert ray_df_equals_pandas(ray_df, expected)
def test_drop_api_equivalence():
    """Check that ``labels``/``axis`` and ``index``/``columns`` forms agree.

    Also checks that conflicting or incomplete keyword combinations raise
    ``ValueError``.
    """
    # equivalence of the labels/axis and index/columns API's (GH12392)
    pandas_frame = pd.DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                                index=['a', 'b', 'c'],
                                columns=['d', 'e', 'f'])
    ray_df = from_pandas(pandas_frame, 3)

    # Each pair is (labels/axis spelling, index/columns spelling). Thunks
    # keep evaluation lazy so each pair is computed just before its assert.
    equivalent_calls = [
        (lambda: ray_df.drop('a'),
         lambda: ray_df.drop(index='a')),
        (lambda: ray_df.drop('d', 1),
         lambda: ray_df.drop(columns='d')),
        (lambda: ray_df.drop(labels='e', axis=1),
         lambda: ray_df.drop(columns='e')),
        (lambda: ray_df.drop(['a'], axis=0),
         lambda: ray_df.drop(index=['a'])),
        (lambda: ray_df.drop(['a'], axis=0).drop(['d'], axis=1),
         lambda: ray_df.drop(index=['a'], columns=['d'])),
    ]
    for via_labels, via_keywords in equivalent_calls:
        first = via_labels()
        second = via_keywords()
        assert ray_df_equals(first, second)

    # Keyword combinations that must be rejected.
    rejected_kwargs = [
        {'labels': 'a', 'index': 'b'},
        {'labels': 'a', 'columns': 'b'},
        {'axis': 1},
    ]
    for kwargs in rejected_kwargs:
        with pytest.raises(ValueError):
            ray_df.drop(**kwargs)
def test_drop_duplicates():
    """``drop_duplicates()`` must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.drop_duplicates()
def test_duplicated():
    """``duplicated()`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.duplicated()
def test_eq():
    """``eq(None)`` on the test frame must raise ``NotImplementedError``."""
    frame = create_test_dataframe()
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        frame.eq(None)
def test_equals():
    """Check ``equals`` between ray frames.

    Frames built from the same pandas data with different second arguments
    to ``from_pandas`` must compare equal; a frame with one changed value
    must not.
    """
    base = pd.DataFrame({'col1': [2.9, 3, 3, 3],
                         'col2': [2, 3, 4, 1]})
    first = from_pandas(base, 2)
    second = from_pandas(base, 3)
    assert first.equals(second)

    # Same data except col2[2] (5 instead of 4).
    altered = pd.DataFrame({'col1': [2.9, 3, 3, 3],
                            'col2': [2, 3, 5, 1]})
    altered_ray = from_pandas(altered, 4)
    for unchanged in (first, second):
        assert not altered_ray.equals(unchanged)
def test_eval_df_use_case():
    """Check in-place ``eval`` of an assignment expression against pandas."""
    frame = pd.DataFrame({'a': np.random.randn(10),
                          'b': np.random.randn(10)})
    ray_frame = from_pandas(frame, 2)

    expression = "e = arctan2(sin(a), b)"
    eval_options = dict(engine='python', parser='pandas', inplace=True)
    frame.eval(expression, **eval_options)
    ray_frame.eval(expression, **eval_options)

    # TODO: Use a series equality validator.
    assert ray_df_equals_pandas(ray_frame, frame)
def test_eval_df_arithmetic_subexpression():
    """In-place ``eval`` with an arithmetic sub-expression must match pandas."""
    df = pd.DataFrame({'a': np.random.randn(10),
                       'b': np.random.randn(10)})
    ray_df = from_pandas(df, 2)

    expression = "not_e = sin(a + b)"
    eval_options = dict(engine='python', parser='pandas', inplace=True)
    df.eval(expression, **eval_options)
    ray_df.eval(expression, **eval_options)

    # TODO: Use a series equality validator.
    assert ray_df_equals_pandas(ray_df, df)
def test_ewm():
    """Expect NotImplementedError from ``ewm``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.ewm()
def test_expanding():
    """Expect NotImplementedError from ``expanding``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.expanding()
@pytest.fixture
def test_ffill(num_partitions=2):
    """Check that ``ffill`` matches pandas on a frame with NaN runs.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    test_data = TestData()
    tsframe = test_data.tsframe
    # Write the NaNs with .loc instead of chained indexing
    # (``frame['A'][:5] = ...``): a chained assignment may land on a
    # temporary copy, which would leave the frame without any NaN to fill.
    tsframe.loc[tsframe.index[:5], 'A'] = np.nan
    tsframe.loc[tsframe.index[-5:], 'A'] = np.nan

    ray_df = from_pandas(tsframe, num_partitions)
    assert ray_df_equals_pandas(
        ray_df.ffill(),
        tsframe.ffill()
    )
def test_fillna():
    """Run every ``fillna`` scenario helper in sequence."""
    supported_before = (
        test_fillna_sanity,
        test_fillna_downcast,
        test_ffill,
        test_ffill2,
        test_bfill,
        test_bfill2,
        test_fillna_inplace,
        # test_frame_fillna_limit,
        # test_frame_pad_backfill_limit,
        test_fillna_dtype_conversion,
        test_fillna_skip_certain_blocks,
        test_fillna_dict_series,
    )
    for scenario in supported_before:
        scenario()

    # Filling from another DataFrame is expected to be unimplemented.
    with pytest.raises(NotImplementedError):
        test_fillna_dataframe()

    supported_after = (
        test_fillna_columns,
        test_fillna_invalid_method,
        test_fillna_invalid_value,
        test_fillna_col_reordering,
    )
    for scenario in supported_after:
        scenario()
@pytest.fixture
def test_fillna_sanity(num_partitions=2):
    """Compare basic ``fillna`` behaviour of the Ray frame against pandas.

    Exercises scalar and ``method='pad'`` fills on the time-series,
    mixed-type and mixed-float fixture frames, dict fills (copying and in
    place), a datelike fill, and the ValueError raised when ``fillna`` gets
    no arguments or both a value and a method.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    def as_ray(frame):
        # Build the Ray counterpart of a pandas frame.
        return from_pandas(frame, num_partitions)

    test_data = TestData()

    # Blank out both ends of column 'A' in the time-series frame.
    ts = test_data.tsframe
    ts.loc[ts.index[:5], 'A'] = np.nan
    ts.loc[ts.index[-5:], 'A'] = np.nan

    zero_filled = test_data.tsframe.fillna(0)
    ray_df = as_ray(test_data.tsframe).fillna(0)
    assert ray_df_equals_pandas(ray_df, zero_filled)

    padded = test_data.tsframe.fillna(method='pad')
    ray_df = as_ray(test_data.tsframe).fillna(method='pad')
    assert ray_df_equals_pandas(ray_df, padded)

    # mixed type
    mixed = test_data.mixed_frame
    mixed.loc[mixed.index[5:20], 'foo'] = np.nan
    mixed.loc[mixed.index[-10:], 'A'] = np.nan

    expected = test_data.mixed_frame.fillna(value=0)
    ray_df = as_ray(test_data.mixed_frame).fillna(value=0)
    assert ray_df_equals_pandas(ray_df, expected)

    expected = test_data.mixed_frame.fillna(method='pad')
    ray_df = as_ray(test_data.mixed_frame).fillna(method='pad')
    assert ray_df_equals_pandas(ray_df, expected)

    # Neither value nor method given -> ValueError, for pandas and Ray.
    pytest.raises(ValueError, test_data.tsframe.fillna)
    pytest.raises(ValueError, as_ray(test_data.tsframe).fillna)
    # Both value and method given -> ValueError.
    with pytest.raises(ValueError):
        as_ray(test_data.tsframe).fillna(5, method='ffill')

    # mixed numeric (but no float16)
    mixed_float = test_data.mixed_float.reindex(columns=['A', 'B', 'D'])
    mixed_float.loc[mixed_float.index[-10:], 'A'] = np.nan

    expected = mixed_float.fillna(value=0)
    ray_df = as_ray(mixed_float).fillna(value=0)
    assert ray_df_equals_pandas(ray_df, expected)

    expected = mixed_float.fillna(method='pad')
    ray_df = as_ray(mixed_float).fillna(method='pad')
    assert ray_df_equals_pandas(ray_df, expected)

    # TODO: Use this when Arrow issue resolves:
    # (https://issues.apache.org/jira/browse/ARROW-2122)
    # empty frame (GH #2778)
    # df = DataFrame(columns=['x'])
    # for m in ['pad', 'backfill']:
    #     df.x.fillna(method=m, inplace=True)
    #     df.x.fillna(method=m)

    # with different dtype (GH3386)
    df = pd.DataFrame([['a', 'a', np.nan, 'a'],
                       ['b', 'b', np.nan, 'b'],
                       ['c', 'c', np.nan, 'c']])

    expected = df.fillna({2: 'foo'})
    ray_df = as_ray(df).fillna({2: 'foo'})
    assert ray_df_equals_pandas(ray_df, expected)

    # Same fill, this time in place on both frames.
    ray_df = as_ray(df)
    df.fillna({2: 'foo'}, inplace=True)
    ray_df.fillna({2: 'foo'}, inplace=True)
    assert ray_df_equals_pandas(ray_df, expected)

    # limit and value
    df = pd.DataFrame(np.random.randn(10, 3))
    df.iloc[2:7, 0] = np.nan
    df.iloc[3:5, 2] = np.nan

    # result = df.fillna(999, limit=1)
    # ray_df = from_pandas(df, num_partitions).fillna(999, limit=1)

    # assert ray_df_equals_pandas(ray_df, result)

    # with datelike
    # GH 6344
    df = pd.DataFrame({
        'Date': [pd.NaT, pd.Timestamp("2014-1-1")],
        'Date2': [pd.Timestamp("2013-1-1"), pd.NaT]
    })
    expected = df.fillna(value={'Date': df['Date2']})
    ray_df = as_ray(df).fillna(value={'Date': df['Date2']})
    assert ray_df_equals_pandas(ray_df, expected)

    # TODO: Use this when Arrow issue resolves:
    # (https://issues.apache.org/jira/browse/ARROW-2122)
    # with timezone
    # GH 15855
    """
    df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                             pd.NaT]})
    ray_df = from_pandas(df, num_partitions)
    assert ray_df_equals_pandas(ray_df.fillna(method='pad'),
                                df.fillna(method='pad'))
    df = pd.DataFrame({'A': [pd.NaT,
                             pd.Timestamp('2012-11-11 00:00:00+01:00')]})
    ray_df = from_pandas(df, num_partitions).fillna(method='bfill')
    assert ray_df_equals_pandas(ray_df, df.fillna(method='bfill'))
    """
@pytest.fixture
def test_fillna_downcast(num_partitions=2):
    """Check ``fillna(..., downcast='infer')`` against pandas (GH 15277).

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    # infer int64 from float64, first with a scalar fill value and then
    # with the fill value given as a dict
    for fill_value in (0, {'a': 0}):
        df = pd.DataFrame({'a': [1., np.nan]})
        expected = df.fillna(fill_value, downcast='infer')
        ray_df = from_pandas(df, num_partitions).fillna(
            fill_value, downcast='infer'
        )
        assert ray_df_equals_pandas(ray_df, expected)
@pytest.fixture
def test_ffill2(num_partitions=2):
    """Check that ``fillna(method='ffill')`` matches pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    test_data = TestData()
    tsframe = test_data.tsframe
    # Write the NaNs with .loc instead of chained indexing
    # (``frame['A'][:5] = ...``): a chained assignment may land on a
    # temporary copy, which would leave the frame without any NaN to fill.
    tsframe.loc[tsframe.index[:5], 'A'] = np.nan
    tsframe.loc[tsframe.index[-5:], 'A'] = np.nan

    ray_df = from_pandas(tsframe, num_partitions)
    assert ray_df_equals_pandas(
        ray_df.fillna(method='ffill'),
        tsframe.fillna(method='ffill')
    )
@pytest.fixture
def test_bfill2(num_partitions=2):
    """Check that ``fillna(method='bfill')`` matches pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    test_data = TestData()
    tsframe = test_data.tsframe
    # Write the NaNs with .loc instead of chained indexing
    # (``frame['A'][:5] = ...``): a chained assignment may land on a
    # temporary copy, which would leave the frame without any NaN to fill.
    tsframe.loc[tsframe.index[:5], 'A'] = np.nan
    tsframe.loc[tsframe.index[-5:], 'A'] = np.nan

    ray_df = from_pandas(tsframe, num_partitions)
    assert ray_df_equals_pandas(
        ray_df.fillna(method='bfill'),
        tsframe.fillna(method='bfill')
    )
@pytest.fixture
def test_fillna_inplace(num_partitions=2):
    """Check in-place ``fillna`` (value and method) against pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    df = pd.DataFrame(np.random.randn(10, 4))
    # Positional .iloc writes replace the chained ``df[1][:4] = ...`` form,
    # which may assign into a temporary copy. The frame has default integer
    # labels, so positions 1 and 3 are the columns labelled 1 and 3.
    df.iloc[:4, 1] = np.nan
    df.iloc[-4:, 3] = np.nan

    ray_df = from_pandas(df, num_partitions)
    df.fillna(value=0, inplace=True)
    # The Ray frame was built before the fill, so it must still differ.
    assert not ray_df_equals_pandas(ray_df, df)

    ray_df.fillna(value=0, inplace=True)
    assert ray_df_equals_pandas(ray_df, df)

    # An in-place fill returns None.
    ray_df = from_pandas(df, num_partitions).fillna(value={0: 0},
                                                    inplace=True)
    assert ray_df is None

    # Re-introduce the NaNs and repeat with a fill method.
    df.iloc[:4, 1] = np.nan
    df.iloc[-4:, 3] = np.nan
    ray_df = from_pandas(df, num_partitions)
    df.fillna(method='ffill', inplace=True)

    assert not ray_df_equals_pandas(ray_df, df)

    ray_df.fillna(method='ffill', inplace=True)
    assert ray_df_equals_pandas(ray_df, df)
@pytest.fixture
def test_frame_fillna_limit(num_partitions=2):
    """Check ``fillna`` with ``limit=5`` for pad and backfill against pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    index = np.arange(10)
    df = pd.DataFrame(np.random.randn(10, 4), index=index)

    # Keep only the first (or last) two rows, reindex back to ten rows so the
    # remainder is NaN, then fill towards the gap.
    cases = ((slice(None, 2), 'pad'), (slice(-2, None), 'backfill'))
    for kept_rows, method in cases:
        expected = df[kept_rows].reindex(index)
        expected = expected.fillna(method=method, limit=5)
        ray_df = from_pandas(
            df[kept_rows].reindex(index), num_partitions
        ).fillna(method=method, limit=5)
        assert ray_df_equals_pandas(ray_df, expected)
@pytest.fixture
def test_frame_pad_backfill_limit(num_partitions=2):
    """Check limited pad / backfill on a reindexed frame against pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    index = np.arange(10)
    df = pd.DataFrame(np.random.randn(10, 4), index=index)

    cases = ((slice(None, 2), 'pad'), (slice(-2, None), 'backfill'))
    for kept_rows, method in cases:
        sparse = df[kept_rows].reindex(index)
        ray_df = from_pandas(sparse, num_partitions)
        ray_filled = ray_df.fillna(method=method, limit=5)
        pandas_filled = sparse.fillna(method=method, limit=5)
        assert ray_df_equals_pandas(ray_filled, pandas_filled)
@pytest.fixture
def test_fillna_dtype_conversion(num_partitions=2):
    """Check ``fillna`` results where the fill value changes the dtype.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    # make sure that fillna on an empty frame works
    # NOTE(review): this frame is built but never exercised; it is replaced
    # by the next assignment before any check runs.
    df = pd.DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])

    # empty block
    df = pd.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64')
    ray_df = from_pandas(df, num_partitions)
    ray_filled = ray_df.fillna('nan')
    pandas_filled = df.fillna('nan')
    assert ray_df_equals_pandas(ray_filled, pandas_filled)

    # equiv of replace
    df = pd.DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
    ray_df = from_pandas(df, num_partitions)
    for v in ['', 1, np.nan, 1.0]:
        ray_filled = ray_df.fillna(v)
        pandas_filled = df.fillna(v)
        assert ray_df_equals_pandas(ray_filled, pandas_filled)
@pytest.fixture
def test_fillna_skip_certain_blocks(num_partitions=2):
    """Check ``fillna(np.nan)`` on an integer frame against pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    # don't try to fill boolean, int blocks
    int_frame = pd.DataFrame(np.random.randn(10, 4).astype(int))
    ray_df = from_pandas(int_frame, num_partitions)

    # it works!
    ray_filled = ray_df.fillna(np.nan)
    pandas_filled = int_frame.fillna(np.nan)
    assert ray_df_equals_pandas(ray_filled, pandas_filled)
@pytest.fixture
def test_fillna_dict_series(num_partitions=2):
    """Check ``fillna`` with dict and Series fill values against pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    df = pd.DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
                       'b': [1, 2, 3, np.nan, np.nan],
                       'c': [np.nan, 1, 2, 3, 4]})
    ray_df = from_pandas(df, num_partitions)

    ray_filled = ray_df.fillna({'a': 0, 'b': 5})
    pandas_filled = df.fillna({'a': 0, 'b': 5})
    assert ray_df_equals_pandas(ray_filled, pandas_filled)

    # it works ('d' is not a column of the frame)
    ray_filled = ray_df.fillna({'a': 0, 'b': 5, 'd': 7})
    pandas_filled = df.fillna({'a': 0, 'b': 5, 'd': 7})
    assert ray_df_equals_pandas(ray_filled, pandas_filled)

    # Series treated same as dict
    ray_filled = ray_df.fillna(df.max())
    pandas_filled = df.fillna(df.max())
    assert ray_df_equals_pandas(ray_filled, pandas_filled)
@pytest.fixture
def test_fillna_dataframe(num_partitions=2):
    """Compare ``fillna`` with a DataFrame fill value against pandas (GH 8377).

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    target = pd.DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
                           'b': [1, 2, 3, np.nan, np.nan],
                           'c': [np.nan, 1, 2, 3, 4]},
                          index=list('VWXYZ'))
    ray_df = from_pandas(target, num_partitions)

    # the filler may have different index and columns
    filler = pd.DataFrame({'a': [np.nan, 10, 20, 30, 40],
                           'b': [50, 60, 70, 80, 90],
                           'foo': ['bar'] * 5},
                          index=list('VWXuZ'))

    # only those columns and indices which are shared get filled
    ray_filled = ray_df.fillna(filler)
    pandas_filled = target.fillna(filler)
    assert ray_df_equals_pandas(ray_filled, pandas_filled)
@pytest.fixture
def test_fillna_columns(num_partitions=2):
    """Check forward fill along ``axis=1`` against pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    df = pd.DataFrame(np.random.randn(10, 10))
    df.values[:, ::2] = np.nan

    def check_row_ffill(frame):
        # Forward-fill along axis=1 in both libraries and compare.
        ray_df = from_pandas(frame, num_partitions)
        ray_filled = ray_df.fillna(method='ffill', axis=1)
        pandas_filled = frame.fillna(method='ffill', axis=1)
        assert ray_df_equals_pandas(ray_filled, pandas_filled)

    check_row_ffill(df)

    # Repeat after inserting a constant column 'foo' at position 6.
    df.insert(6, 'foo', 5)
    check_row_ffill(df)
@pytest.fixture
def test_fillna_invalid_method(num_partitions=2):
    """Expect a ValueError matching 'ffil' for an unknown fill method.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    pandas_frame = TestData().frame
    ray_df = from_pandas(pandas_frame, num_partitions)
    with tm.assert_raises_regex(ValueError, 'ffil'):
        ray_df.fillna(method='ffil')
@pytest.fixture
def test_fillna_invalid_value(num_partitions=2):
    """Expect TypeError when the fill value is a list or a tuple.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    pandas_frame = TestData().frame
    ray_df = from_pandas(pandas_frame, num_partitions)
    # list first, then tuple
    for bad_value in ([1, 2], (1, 2)):
        pytest.raises(TypeError, ray_df.fillna, bad_value)
    # TODO: Uncomment when iloc is implemented
    # frame with series
    # pytest.raises(ValueError, ray_df.iloc[:, 0].fillna, ray_df)
@pytest.fixture
def test_fillna_col_reordering(num_partitions=2):
    """Check forward fill on a frame with descending column labels.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    # Labels run COL.5, COL.4, ..., COL.1.
    labels = ["COL." + str(i) for i in range(5, 0, -1)]
    values = np.random.rand(20, 5)
    df = pd.DataFrame(index=range(20), columns=labels, data=values)
    ray_df = from_pandas(df, num_partitions)
    ray_filled = ray_df.fillna(method='ffill')
    pandas_filled = df.fillna(method='ffill')
    assert ray_df_equals_pandas(ray_filled, pandas_filled)
"""
TODO: Use this when Arrow issue resolves:
(https://issues.apache.org/jira/browse/ARROW-2122)
@pytest.fixture
def test_fillna_datetime_columns(num_partitions=2):
# GH 7095
df = pd.DataFrame({'A': [-1, -2, np.nan],
'B': date_range('20130101', periods=3),
'C': ['foo', 'bar', None],
'D': ['foo2', 'bar2', None]},
index=date_range('20130110', periods=3))
ray_df = from_pandas(df, num_partitions)
assert ray_df_equals_pandas(
ray_df.fillna('?'),
df.fillna('?')
)
df = pd.DataFrame({'A': [-1, -2, np.nan],
'B': [pd.Timestamp('2013-01-01'),
pd.Timestamp('2013-01-02'), pd.NaT],
'C': ['foo', 'bar', None],
'D': ['foo2', 'bar2', None]},
index=date_range('20130110', periods=3))
ray_df = from_pandas(df, num_partitions)
assert ray_df_equals_pandas(
ray_df.fillna('?'),
df.fillna('?')
)
"""
def test_filter():
    """Expect NotImplementedError from ``filter``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.filter()
def test_first():
    """Expect NotImplementedError from ``first``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.first(None)
@pytest.fixture
def test_first_valid_index(ray_df, pandas_df):
    """Check that ``first_valid_index`` agrees between Ray and pandas."""
    ray_label = ray_df.first_valid_index()
    pandas_label = pandas_df.first_valid_index()
    assert ray_label == pandas_label
def test_floordiv():
    """Expect NotImplementedError from ``floordiv``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.floordiv(None)
def test_from_csv():
    """Expect NotImplementedError from ``DataFrame.from_csv``."""
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        rdf.DataFrame.from_csv(None)
def test_from_dict():
    """Expect NotImplementedError from ``DataFrame.from_dict``."""
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        rdf.DataFrame.from_dict(None)
def test_from_items():
    """Expect NotImplementedError from ``DataFrame.from_items``."""
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        rdf.DataFrame.from_items(None)
def test_from_records():
    """Expect NotImplementedError from ``DataFrame.from_records``."""
    expected_error = NotImplementedError
    with pytest.raises(expected_error):
        rdf.DataFrame.from_records(None)
def test_ge():
    """Expect NotImplementedError from ``ge``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.ge(None)
def test_get_value():
    """Expect NotImplementedError from ``get_value``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.get_value(None, None)
def test_get_values():
    """Expect NotImplementedError from ``get_values``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.get_values()
def test_gt():
    """Expect NotImplementedError from ``gt``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.gt(None)
@pytest.fixture
def test_head(ray_df, pandas_df, n=5):
    """Check that ``head(n)`` matches pandas.

    Args:
        ray_df: Ray frame under test.
        pandas_df: pandas frame holding the same data.
        n: Number of leading rows to request from both frames.
    """
    # The comparison result was previously discarded, so this check could
    # never fail; assert on it.
    assert ray_df_equals_pandas(ray_df.head(n), pandas_df.head(n))
def test_hist():
    """Expect NotImplementedError from ``hist``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.hist(None)
@pytest.fixture
def test_idxmax(ray_df, pandas_df):
    """Check that ``idxmax`` matches pandas after sorting both results."""
    ray_result = ray_df.idxmax().sort_index()
    pandas_result = pandas_df.idxmax().sort_index()
    assert ray_result.equals(pandas_result)
@pytest.fixture
def test_idxmin(ray_df, pandas_df):
    """Check that ``idxmin`` matches pandas after sorting both results."""
    ray_result = ray_df.idxmin().sort_index()
    pandas_result = pandas_df.idxmin().sort_index()
    assert ray_result.equals(pandas_result)
def test_infer_objects():
    """Expect NotImplementedError from ``infer_objects``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.infer_objects()
@pytest.fixture
def test_info(ray_df):
    """Check the text returned by ``info`` for the expected fragments."""
    default_report = ray_df.info()
    assert '<class \'ray.dataframe.dataframe.DataFrame\'>\n' in default_report

    memory_report = ray_df.info(memory_usage=True)
    assert 'memory_usage: ' in memory_report
@pytest.fixture
def test_insert(ray_df, pandas_df, loc, column, value):
    """Check that ``insert`` produces the same frame as pandas.

    Args:
        ray_df: Ray frame under test (copied, not mutated).
        pandas_df: pandas frame holding the same data (copied, not mutated).
        loc: Insertion position forwarded to ``insert``.
        column: Label of the new column.
        value: Value(s) of the new column.
    """
    ray_df_cp = ray_df.copy()
    pd_df_cp = pandas_df.copy()

    ray_df_cp.insert(loc, column, value)
    pd_df_cp.insert(loc, column, value)

    # Previously nothing was compared, so the test only checked that the
    # call did not raise.
    assert ray_df_equals_pandas(ray_df_cp, pd_df_cp)
def test_interpolate():
    """Expect NotImplementedError from ``interpolate``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.interpolate()
@pytest.fixture
def test_items(ray_df, pandas_df):
    """Check that ``items`` yields the same (label, Series) pairs as pandas."""
    paired = zip(ray_df.items(), pandas_df.items())
    for (ray_index, ray_series), (pandas_index, pandas_series) in paired:
        assert pandas_series.equals(ray_series)
        assert pandas_index == ray_index
@pytest.fixture
def test_iteritems(ray_df, pandas_df):
    """Check that ``iteritems`` yields the same pairs as pandas."""
    paired = zip(ray_df.iteritems(), pandas_df.iteritems())
    for (ray_index, ray_series), (pandas_index, pandas_series) in paired:
        assert pandas_series.equals(ray_series)
        assert pandas_index == ray_index
@pytest.fixture
def test_iterrows(ray_df, pandas_df):
    """Check that ``iterrows`` yields the same (label, row) pairs as pandas."""
    paired = zip(ray_df.iterrows(), pandas_df.iterrows())
    for (ray_index, ray_series), (pandas_index, pandas_series) in paired:
        assert pandas_series.equals(ray_series)
        assert pandas_index == ray_index
@pytest.fixture
def test_itertuples(ray_df, pandas_df):
    """Check ``itertuples`` against pandas for default and custom options."""
    def compare_rows(ray_rows, pandas_rows):
        # Walk both iterators in lockstep and compare each row.
        for ray_row, pandas_row in zip(ray_rows, pandas_rows):
            np.testing.assert_equal(ray_row, pandas_row)

    # test default
    compare_rows(ray_df.itertuples(), pandas_df.itertuples())

    # test all combinations of custom params
    combos = [(index, name)
              for index in [True, False]
              for name in [None, 'NotPandas', 'Pandas']]
    for index, name in combos:
        compare_rows(ray_df.itertuples(index=index, name=name),
                     pandas_df.itertuples(index=index, name=name))
def test_join():
    """Check ``join`` against pandas for single and multiple right frames."""
    ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
                            "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]})

    pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
                              "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]})

    ray_df2 = rdf.DataFrame({"col5": [0], "col6": [1]})

    pandas_df2 = pd.DataFrame({"col5": [0], "col6": [1]})

    join_types = ["left", "right", "outer", "inner"]
    for how in join_types:
        ray_join = ray_df.join(ray_df2, how=how)
        pandas_join = pandas_df.join(pandas_df2, how=how)
        # The comparison result used to be discarded; assert on it so a
        # mismatch actually fails the test.
        assert ray_df_equals_pandas(ray_join, pandas_join)

    ray_df3 = rdf.DataFrame({"col7": [1, 2, 3, 5, 6, 7, 8]})

    pandas_df3 = pd.DataFrame({"col7": [1, 2, 3, 5, 6, 7, 8]})

    # Joining a list of frames is exercised without the "right" mode.
    join_types = ["left", "outer", "inner"]
    for how in join_types:
        ray_join = ray_df.join([ray_df2, ray_df3], how=how)
        pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how)
        assert ray_df_equals_pandas(ray_join, pandas_join)
def test_kurt():
    """Expect NotImplementedError from ``kurt``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.kurt()
def test_kurtosis():
    """Expect NotImplementedError from ``kurtosis``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.kurtosis()
def test_last():
    """Expect NotImplementedError from ``last``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.last(None)
@pytest.fixture
def test_last_valid_index(ray_df, pandas_df):
    """Check that ``last_valid_index`` agrees between Ray and pandas."""
    ray_label = ray_df.last_valid_index()
    pandas_label = pandas_df.last_valid_index()
    assert ray_label == pandas_label
def test_le():
    """Expect NotImplementedError from ``le``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.le(None)
def test_lookup():
    """Expect NotImplementedError from ``lookup``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.lookup(None, None)
def test_lt():
    """Expect NotImplementedError from ``lt``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.lt(None)
def test_mad():
    """Expect NotImplementedError from ``mad``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.mad()
def test_mask():
    """Expect NotImplementedError from ``mask``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.mask(None)
@pytest.fixture
def test_max(ray_df, pandas_df):
    """Check ``max`` against pandas with default arguments and ``axis=1``."""
    for options in ({}, {'axis': 1}):
        ray_result = ray_df.max(**options)
        pandas_result = pandas_df.max(**options)
        assert ray_series_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_mean(ray_df, pandas_df):
    """Check that ``mean`` equals the pandas result."""
    ray_result = ray_df.mean()
    pandas_result = pandas_df.mean()
    assert ray_result.equals(pandas_result)
@pytest.fixture
def test_median(ray_df, pandas_df):
    """Check that ``median`` equals the pandas result."""
    ray_result = ray_df.median()
    pandas_result = pandas_df.median()
    assert ray_result.equals(pandas_result)
def test_melt():
    """Expect NotImplementedError from ``melt``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.melt()
@pytest.fixture
def test_memory_usage(ray_df):
    """Sanity-check the type and contents of ``memory_usage`` results."""
    default_usage = ray_df.memory_usage()
    assert type(default_usage) is pd.core.series.Series

    # With index=True the result must carry an 'Index' entry.
    assert ray_df.memory_usage(index=True).at['Index'] is not None

    # A deep measurement can never report less than a shallow one.
    deep_total = ray_df.memory_usage(deep=True).sum()
    shallow_total = ray_df.memory_usage(deep=False).sum()
    assert deep_total >= shallow_total
def test_merge():
    """Expect NotImplementedError from ``merge``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.merge(None)
@pytest.fixture
def test_min(ray_df, pandas_df):
    """Check ``min`` against pandas with default arguments and ``axis=1``."""
    for options in ({}, {'axis': 1}):
        ray_result = ray_df.min(**options)
        pandas_result = pandas_df.min(**options)
        assert ray_series_equals_pandas(ray_result, pandas_result)
def test_mod():
    """Expect NotImplementedError from ``mod``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.mod(None)
def test_mode():
    """Expect NotImplementedError from ``mode``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.mode()
def test_mul():
    """Expect NotImplementedError from ``mul``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.mul(None)
def test_multiply():
    """Expect NotImplementedError from ``multiply``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.multiply(None)
def test_ne():
    """Expect NotImplementedError from ``ne``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.ne(None)
def test_nlargest():
    """Expect NotImplementedError from ``nlargest``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.nlargest(None, None)
@pytest.fixture
def test_notna(ray_df, pandas_df):
    """Check that ``notna`` matches pandas."""
    ray_mask = ray_df.notna()
    pandas_mask = pandas_df.notna()
    assert ray_df_equals_pandas(ray_mask, pandas_mask)
@pytest.fixture
def test_notnull(ray_df, pandas_df):
    """Check that ``notnull`` matches pandas."""
    ray_mask = ray_df.notnull()
    pandas_mask = pandas_df.notnull()
    assert ray_df_equals_pandas(ray_mask, pandas_mask)
def test_nsmallest():
    """Expect NotImplementedError from ``nsmallest``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.nsmallest(None, None)
def test_nunique():
    """Expect NotImplementedError from ``nunique``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.nunique()
def test_pct_change():
    """Expect NotImplementedError from ``pct_change``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.pct_change()
def test_pipe():
    """Expect NotImplementedError from ``pipe``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.pipe(None)
def test_pivot():
    """Expect NotImplementedError from ``pivot``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.pivot()
def test_pivot_table():
    """Expect NotImplementedError from ``pivot_table``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.pivot_table()
def test_plot():
    """Expect NotImplementedError from ``plot``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.plot()
@pytest.fixture
def test_pop(ray_df, pandas_df):
    """Check that ``pop('col2')`` returns and removes the same column.

    Args:
        ray_df: Ray frame under test (copied, not mutated).
        pandas_df: pandas frame holding the same data (copied, not mutated).
    """
    temp_ray_df = ray_df.copy()
    temp_pandas_df = pandas_df.copy()
    ray_popped = temp_ray_df.pop('col2')
    pandas_popped = temp_pandas_df.pop('col2')
    assert ray_popped.sort_index().equals(pandas_popped.sort_index())
    # The remaining frames must agree too; this comparison used to be
    # computed and thrown away.
    assert ray_df_equals_pandas(temp_ray_df, temp_pandas_df)
def test_pow():
    """Expect NotImplementedError from ``pow``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.pow(None)
def test_prod():
    """Expect NotImplementedError from ``prod``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.prod(None)
def test_product():
    """Expect NotImplementedError from ``product``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.product()
@pytest.fixture
def test_quantile(ray_df, pandas_df, q):
    """Check that ``quantile(q)`` equals the pandas result."""
    ray_result = ray_df.quantile(q)
    pandas_result = pandas_df.quantile(q)
    assert ray_result.equals(pandas_result)
@pytest.fixture
def test_query(ray_df, pandas_df, funcs):
    """Check that each query in ``funcs`` selects the same rows as pandas."""
    for expression in funcs:
        expected = pandas_df.query(expression)
        actual = ray_df.query(expression)
        assert expected.equals(to_pandas(actual))
def test_radd():
    """Expect NotImplementedError from ``radd``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.radd(None)
def test_rank():
    """Expect NotImplementedError from ``rank``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rank()
def test_rdiv():
    """Expect NotImplementedError from ``rdiv``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rdiv(None)
def test_reindex():
    """Expect NotImplementedError from ``reindex``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.reindex()
def test_reindex_axis():
    """Expect NotImplementedError from ``reindex_axis``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.reindex_axis(None)
def test_reindex_like():
    """Expect NotImplementedError from ``reindex_like``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.reindex_like(None)
# Renaming
def test_rename():
    """Run every ``rename`` scenario helper in sequence."""
    scenarios = (
        test_rename_sanity,
        test_rename_multiindex,
        # TODO: Uncomment when __setitem__ is implemented
        # test_rename_nocopy,
        test_rename_inplace,
        test_rename_bug,
    )
    for scenario in scenarios:
        scenario()
@pytest.fixture
def test_rename_sanity(num_partitions=2):
    """Compare basic ``rename`` behaviour of the Ray frame against pandas.

    Covers column renames via dict and callable, index renames via dict and
    callable, the TypeError raised when no mapper is given, a partial column
    mapping, and preservation of a named index.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    test_data = TestData()
    mapping = {
        'A': 'a',
        'B': 'b',
        'C': 'c',
        'D': 'd'
    }

    ray_df = from_pandas(test_data.frame, num_partitions)
    assert ray_df_equals_pandas(
        ray_df.rename(columns=mapping),
        test_data.frame.rename(columns=mapping)
    )

    renamed2 = test_data.frame.rename(columns=str.lower)
    assert ray_df_equals_pandas(
        ray_df.rename(columns=str.lower),
        renamed2
    )

    ray_df = from_pandas(renamed2, num_partitions)
    assert ray_df_equals_pandas(
        ray_df.rename(columns=str.upper),
        renamed2.rename(columns=str.upper)
    )

    # index
    data = {
        'A': {'foo': 0, 'bar': 1}
    }

    # gets sorted alphabetical
    df = pd.DataFrame(data)
    ray_df = from_pandas(df, num_partitions)
    tm.assert_index_equal(
        ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}).index,
        df.rename(index={'foo': 'bar', 'bar': 'foo'}).index
    )

    tm.assert_index_equal(
        ray_df.rename(index=str.upper).index,
        df.rename(index=str.upper).index
    )

    # have to pass something
    pytest.raises(TypeError, ray_df.rename)

    # partial columns
    renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'})
    ray_df = from_pandas(test_data.frame, num_partitions)
    ray_renamed = ray_df.rename(columns={'C': 'foo', 'D': 'bar'})
    # A column rename leaves the row index untouched, so comparing .index
    # alone would pass even if no label was renamed. Compare .columns as
    # well, since that is the axis being changed.
    tm.assert_index_equal(ray_renamed.index, renamed.index)
    tm.assert_index_equal(ray_renamed.columns, renamed.columns)

    # TODO: Uncomment when transpose works
    # other axis
    # renamed = test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'})
    # tm.assert_index_equal(
    #     test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}).index,
    #     ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index
    # )

    # index with name
    index = pd.Index(['foo', 'bar'], name='name')
    renamer = pd.DataFrame(data, index=index)

    ray_df = from_pandas(renamer, num_partitions)
    renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
    ray_renamed = ray_df.rename(index={'foo': 'bar', 'bar': 'foo'})
    tm.assert_index_equal(
        renamed.index, ray_renamed.index
    )

    assert renamed.index.name == ray_renamed.index.name
@pytest.fixture
def test_rename_multiindex(num_partitions=2):
    """Compare ``rename`` on MultiIndex axes against pandas.

    Renames are checked without a level, and with a level given by position
    and by name, using both a dict and a callable as the mapper.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    index = pd.MultiIndex.from_tuples(
        [('foo1', 'bar1'), ('foo2', 'bar2')], names=['foo', 'bar'])
    columns = pd.MultiIndex.from_tuples(
        [('fizz1', 'buzz1'), ('fizz2', 'buzz2')], names=['fizz', 'buzz'])
    df = pd.DataFrame([(0, 0), (1, 1)], index=index, columns=columns)
    ray_df = from_pandas(df, num_partitions)

    index_map = {'foo1': 'foo3', 'bar2': 'bar3'}
    column_map = {'fizz1': 'fizz3', 'buzz2': 'buzz3'}

    #
    # without specifying level -> across all levels
    renamed = df.rename(index=index_map, columns=column_map)
    ray_renamed = ray_df.rename(index=index_map, columns=column_map)
    tm.assert_index_equal(
        renamed.index, ray_renamed.index
    )

    renamed = df.rename(index=index_map, columns=column_map)
    tm.assert_index_equal(renamed.columns, ray_renamed.columns)
    assert renamed.index.names == ray_renamed.index.names
    assert renamed.columns.names == ray_renamed.columns.names

    #
    # with specifying a level (GH13766)
    # first the dict mapper, then a function, each over every column level
    # addressed by position and by name
    for mapper in (column_map, str.upper):
        for level in (0, 'fizz', 1, 'buzz'):
            renamed = df.rename(columns=mapper, level=level)
            ray_renamed = ray_df.rename(columns=mapper, level=level)
            tm.assert_index_equal(renamed.columns, ray_renamed.columns)

    # index
    renamed = df.rename(index=index_map, level=0)
    ray_renamed = ray_df.rename(index=index_map, level=0)
    tm.assert_index_equal(ray_renamed.index, renamed.index)
@pytest.fixture
def test_rename_nocopy(num_partitions=2):
    """Check that writes through a ``copy=False`` rename reach the source.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    pandas_frame = TestData().frame
    ray_df = from_pandas(pandas_frame, num_partitions)
    ray_renamed = ray_df.rename(columns={'C': 'foo'}, copy=False)
    # Assign through the renamed frame, then read the original column.
    ray_renamed['foo'] = 1
    assert (ray_df['C'] == 1).all()
@pytest.fixture
def test_rename_inplace(num_partitions=2):
    """Check copying and in-place column renames against pandas.

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    pandas_frame = TestData().frame
    ray_df = from_pandas(pandas_frame, num_partitions)

    # Copying rename.
    ray_renamed = ray_df.rename(columns={'C': 'foo'})
    pandas_renamed = pandas_frame.rename(columns={'C': 'foo'})
    assert ray_df_equals_pandas(ray_renamed, pandas_renamed)

    # In-place rename, applied to copies of both frames.
    pandas_copy = pandas_frame.copy()
    ray_copy = ray_df.copy()
    pandas_copy.rename(columns={'C': 'foo'}, inplace=True)
    ray_copy.rename(columns={'C': 'foo'}, inplace=True)
    assert ray_df_equals_pandas(ray_copy, pandas_copy)
@pytest.fixture
def test_rename_bug(num_partitions=2):
    """Check two successive column renames against pandas (GH 5344).

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    # rename set ref_locs, and set_index was not resetting
    df = pd.DataFrame({0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]})
    ray_df = from_pandas(df, num_partitions)

    df = df.rename(columns={0: 'a'}).rename(columns={1: 'b'})
    # TODO: Uncomment when set_index is implemented
    # df = df.set_index(['a', 'b'])
    # df.columns = ['2001-01-01']

    ray_df = ray_df.rename(columns={0: 'a'}).rename(columns={1: 'b'})
    # TODO: Uncomment when set_index is implemented
    # ray_df = ray_df.set_index(['a', 'b'])
    # ray_df.columns = ['2001-01-01']

    assert ray_df_equals_pandas(ray_df, df)
def test_rename_axis():
    """Run the ``rename_axis`` scenario helper."""
    scenario = test_rename_axis_inplace
    scenario()
@pytest.fixture
def test_rename_axis_inplace(num_partitions=2):
    """Check in-place ``rename_axis`` on both axes against pandas (GH 15704).

    Args:
        num_partitions: Partition count passed to ``from_pandas``.
    """
    test_frame = TestData().frame
    ray_df = from_pandas(test_frame, num_partitions)

    # First the default axis, then axis=1.
    for axis_name, extra in (('foo', {}), ('bar', {'axis': 1})):
        result = test_frame.copy()
        ray_result = ray_df.copy()
        no_return = result.rename_axis(axis_name, inplace=True, **extra)
        ray_no_return = ray_result.rename_axis(
            axis_name, inplace=True, **extra
        )

        # Both calls must hand back the very same object.
        assert no_return is ray_no_return
        assert ray_df_equals_pandas(ray_result, result)
def test_reorder_levels():
    """Expect NotImplementedError from ``reorder_levels``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.reorder_levels(None)
def test_replace():
    """Expect NotImplementedError from ``replace``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.replace()
def test_resample():
    """Expect NotImplementedError from ``resample``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.resample(None)
@pytest.fixture
def test_reset_index(ray_df, pandas_df, inplace=False):
    """Check ``reset_index`` against pandas, copying or in place.

    Args:
        ray_df: Ray frame under test.
        pandas_df: pandas frame holding the same data.
        inplace: Forwarded to ``reset_index``; when true the call is made
            on copies of both frames and the copies are compared.
    """
    if inplace:
        ray_df_cp = ray_df.copy()
        pd_df_cp = pandas_df.copy()
        ray_df_cp.reset_index(inplace=inplace)
        pd_df_cp.reset_index(inplace=inplace)
        assert to_pandas(ray_df_cp).equals(pd_df_cp)
        return

    ray_result = to_pandas(ray_df.reset_index(inplace=inplace))
    pandas_result = pandas_df.reset_index(inplace=inplace)
    assert ray_result.equals(pandas_result)
def test_rfloordiv():
    """Expect NotImplementedError from ``rfloordiv``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rfloordiv(None)
def test_rmod():
    """Expect NotImplementedError from ``rmod``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rmod(None)
def test_rmul():
    """Expect NotImplementedError from ``rmul``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rmul(None)
def test_rolling():
    """Expect NotImplementedError from ``rolling``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rolling(None)
@pytest.fixture
def test_round(ray_df, pd_df):
    """Check ``round`` against pandas with no argument and with 1 decimal."""
    for args in ((), (1,)):
        ray_rounded = ray_df.round(*args)
        pandas_rounded = pd_df.round(*args)
        assert ray_df_equals_pandas(ray_rounded, pandas_rounded)
def test_rpow():
    """Expect NotImplementedError from ``rpow``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rpow(None)
def test_rsub():
    """Expect NotImplementedError from ``rsub``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rsub(None)
def test_rtruediv():
    """Expect NotImplementedError from ``rtruediv``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.rtruediv(None)
def test_sample():
    """Expect NotImplementedError from ``sample``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.sample()
def test_select():
    """Expect NotImplementedError from ``select``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.select(None)
def test_select_dtypes():
    """Expect NotImplementedError from ``select_dtypes``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.select_dtypes()
def test_sem():
    """Expect NotImplementedError from ``sem``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.sem()
@pytest.fixture
def test_set_axis(ray_df, pandas_df, label, axis):
    """Check non-in-place ``set_axis`` against pandas."""
    ray_result = to_pandas(ray_df.set_axis(label, axis, inplace=False))
    pandas_result = pandas_df.set_axis(label, axis, inplace=False)
    assert ray_result.equals(pandas_result)
@pytest.fixture
def test_set_index(ray_df, pandas_df, keys, inplace=False):
    """Check ``set_index`` against pandas, copying or in place.

    Args:
        ray_df: Ray frame under test.
        pandas_df: pandas frame holding the same data.
        keys: Forwarded to ``set_index``.
        inplace: When true the call is made in place on copies of both
            frames and the copies are compared.
    """
    if inplace:
        ray_df_cp = ray_df.copy()
        pd_df_cp = pandas_df.copy()
        ray_df_cp.set_index(keys, inplace=inplace)
        pd_df_cp.set_index(keys, inplace=inplace)
        assert to_pandas(ray_df_cp).equals(pd_df_cp)
        return

    ray_result = to_pandas(ray_df.set_index(keys))
    pandas_result = pandas_df.set_index(keys)
    assert ray_result.equals(pandas_result)
def test_set_value():
    """Expect NotImplementedError from ``set_value``."""
    frame = create_test_dataframe()
    with pytest.raises(NotImplementedError):
        frame.set_value(None, None, None)
def test_shift():
    """Calling `shift` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.shift()
def test_skew():
    """Calling `skew` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.skew()
def test_slice_shift():
    """Calling `slice_shift` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.slice_shift()
def test_sort_index():
    """Calling `sort_index` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.sort_index()
def test_sort_values():
    """Calling `sort_values` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.sort_values(None)
def test_sortlevel():
    """Calling `sortlevel` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.sortlevel()
def test_squeeze():
    """Calling `squeeze` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.squeeze()
def test_stack():
    """Calling `stack` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.stack()
@pytest.fixture
def test_std(ray_df, pandas_df):
    """Assert that `std()` of both frames compares equal via `.equals`."""
    ray_std = ray_df.std()
    pandas_std = pandas_df.std()
    assert ray_std.equals(pandas_std)
def test_sub():
    """Calling `sub` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.sub(None)
def test_subtract():
    """Calling `subtract` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.subtract(None)
def test_swapaxes():
    """Calling `swapaxes` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.swapaxes(None, None)
def test_swaplevel():
    """Calling `swaplevel` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.swaplevel()
@pytest.fixture
def test_tail(ray_df, pandas_df):
    """Assert that `tail()` of the ray frame matches the pandas frame.

    Args:
        ray_df: ray dataframe under test.
        pandas_df: pandas dataframe holding the expected data.
    """
    # ray_df_equals_pandas only returns a bool, so the result has to be
    # asserted; otherwise a mismatch would go unnoticed.
    assert ray_df_equals_pandas(ray_df.tail(), pandas_df.tail())
def test_take():
    """Calling `take` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.take(None)
def test_to_records():
    """Calling `to_records` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_records()
def test_to_sparse():
    """Calling `to_sparse` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_sparse()
def test_to_string():
    """Calling `to_string` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_string()
def test_to_timestamp():
    """Calling `to_timestamp` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_timestamp()
def test_to_xarray():
    """Calling `to_xarray` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_xarray()
@pytest.fixture
def test_transform(ray_df, pandas_df):
    """Assert that `transform` gives matching results on both frames.

    Two forms are checked: a callable that invokes `isna()` on its
    argument, and the string 'isna'.

    Args:
        ray_df: ray dataframe under test.
        pandas_df: pandas dataframe holding the expected data.
    """
    # ray_df_equals_pandas only returns a bool, so each comparison has to
    # be asserted; otherwise a mismatch would go unnoticed.
    assert ray_df_equals_pandas(
        ray_df.transform(lambda df: df.isna()),
        pandas_df.transform(lambda df: df.isna()))
    assert ray_df_equals_pandas(
        ray_df.transform('isna'),
        pandas_df.transform('isna'))
def test_truediv():
    """Calling `truediv` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.truediv(None)
def test_truncate():
    """Calling `truncate` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.truncate()
def test_tshift():
    """Calling `tshift` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.tshift()
def test_tz_convert():
    """Calling `tz_convert` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.tz_convert(None)
def test_tz_localize():
    """Calling `tz_localize` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.tz_localize(None)
def test_unstack():
    """Calling `unstack` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.unstack()
def test_update():
    """Calling `update` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.update(None)
@pytest.fixture
def test_var(ray_df, pandas_df):
    """Assert that `var()` of both frames compares equal via `.equals`."""
    ray_var = ray_df.var()
    pandas_var = pandas_df.var()
    assert ray_var.equals(pandas_var)
def test_where():
    """Calling `where` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.where(None)
def test_xs():
    """Calling `xs` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.xs(None)
@pytest.fixture
def test___getitem__(ray_df, pd_df):
    """Check `__getitem__('col1')` against pandas column selection.

    The ray result must be a `pd.Series` and compare equal to
    `pd_df['col1']`.
    """
    key = 'col1'
    selected = ray_df.__getitem__(key)
    assert isinstance(selected, pd.Series)

    expected = pd_df[key]
    assert expected.equals(selected)
def test___getattr__():
    """Check attribute-style column access on the test dataframe.

    "col1" is looked up three ways (explicit `__getattr__`, the `getattr`
    builtin, dotted access); each must yield a `pd.Series`.
    """
    frame = create_test_dataframe()

    assert isinstance(frame.__getattr__("col1"), pd.Series)
    assert isinstance(getattr(frame, "col1"), pd.Series)
    assert isinstance(frame.col1, pd.Series)

    # A column that is literally named "columns" must not shadow the
    # regular `columns` attribute.
    renamed = frame.rename(index=str, columns={"col5": "columns"})
    assert isinstance(renamed.columns, pd.Index)
def test___setitem__():
    """Calling `__setitem__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__setitem__(None, None)
@pytest.fixture
def test___len__(ray_df, pandas_df):
    """Assert that `len()` agrees between the ray and pandas frames."""
    ray_length = len(ray_df)
    pandas_length = len(pandas_df)
    assert ray_length == pandas_length
def test___unicode__():
    """Calling `__unicode__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__unicode__()
@pytest.fixture
def test___neg__(ray_df, pd_df):
    """Assert that `__neg__` of the ray frame matches pandas.

    The ray result is converted with `to_pandas` before comparison.
    """
    negated_ray = ray_df.__neg__()
    negated_pd = pd_df.__neg__()
    assert negated_pd.equals(to_pandas(negated_ray))
def test___invert__():
    """Calling `__invert__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__invert__()
def test___hash__():
    """Calling `__hash__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__hash__()
@pytest.fixture
def test___iter__(ray_df, pd_df):
    """Check that iterating the ray frame matches iterating pandas.

    Verifies the object returned by `__iter__` exposes the iterator
    interface, then compares the items it yields with those from pandas.
    """
    ray_iter = ray_df.__iter__()

    # Iterator interface: `__iter__` plus either spelling of "next".
    assert hasattr(ray_iter, '__iter__')
    assert any(hasattr(ray_iter, name) for name in ('next', '__next__'))

    pandas_iter = pd_df.__iter__()
    assert list(ray_iter) == list(pandas_iter)
@pytest.fixture
def test___contains__(ray_df, key, result):
    """Assert that membership of `key` in `ray_df` equals `result`.

    Both the explicit `__contains__` call and the `in` operator are
    checked.
    """
    via_dunder = ray_df.__contains__(key)
    assert result == via_dunder

    via_operator = key in ray_df
    assert result == via_operator
def test___nonzero__():
    """Calling `__nonzero__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__nonzero__()
def test___bool__():
    """Calling `__bool__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__bool__()
@pytest.fixture
def test___abs__(ray_df, pandas_df):
    """Assert that `abs()` of the ray frame matches `abs()` of pandas."""
    ray_abs = abs(ray_df)
    pandas_abs = abs(pandas_df)
    assert ray_df_equals_pandas(ray_abs, pandas_abs)
def test___round__():
    """Calling `__round__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__round__()
@pytest.fixture
def test___array__(ray_df, pandas_df):
    """Assert that `__array__()` of both frames is equal element-wise.

    The comparison uses `np.array_equal`.
    """
    ray_array = ray_df.__array__()
    pandas_array = pandas_df.__array__()
    assert np.array_equal(ray_array, pandas_array)
def test___array_wrap__():
    """Calling `__array_wrap__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__array_wrap__(None)
def test___getstate__():
    """Calling `__getstate__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__getstate__()
def test___setstate__():
    """Calling `__setstate__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__setstate__(None)
@pytest.fixture
def test___delitem__(ray_df, pd_df):
    """Assert that deleting 'col1' leaves both frames equal.

    The deletion is performed on copies, so the frames passed in by the
    caller are not modified by this helper.

    Args:
        ray_df: ray dataframe under test.
        pd_df: pandas dataframe holding the expected data.
    """
    ray_copy = ray_df.copy()
    pd_copy = pd_df.copy()
    ray_copy.__delitem__('col1')
    pd_copy.__delitem__('col1')
    # ray_df_equals_pandas only returns a bool, so the result has to be
    # asserted; otherwise a mismatch would go unnoticed.
    assert ray_df_equals_pandas(ray_copy, pd_copy)
def test___finalize__():
    """Calling `__finalize__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__finalize__(None)
@pytest.fixture
def test___copy__(ray_df, pd_df):
    """Assert that `__copy__` of the ray frame matches that of pandas."""
    copied_ray = ray_df.__copy__()
    copied_pd = pd_df.__copy__()
    assert ray_df_equals_pandas(copied_ray, copied_pd)
@pytest.fixture
def test___deepcopy__(ray_df, pd_df):
    """Assert that `__deepcopy__` of the ray frame matches pandas."""
    copied_ray = ray_df.__deepcopy__()
    copied_pd = pd_df.__deepcopy__()
    assert ray_df_equals_pandas(copied_ray, copied_pd)
def test_blocks():
    """Reading the `blocks` attribute must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        # Attribute access only; nothing is called.
        frame.blocks
def test_style():
    """Reading the `style` attribute must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        # Attribute access only; nothing is called.
        frame.style
def test_iat():
    """Evaluating `iat()` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.iat()
def test___rsub__():
    """Calling `__rsub__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__rsub__(None, None, None)
@pytest.fixture
def test_loc(ray_df, pd_df):
    """Compare label-based `.loc` lookups between ray and pandas.

    Covers a single row label, a single cell, label lists and label
    slices, checked in that order.
    """
    # Singleton row: compared with `.equals`.
    row_label = 0
    assert ray_df.loc[row_label].equals(pd_df.loc[row_label])

    # Singleton cell: compared with `==`.
    cell = (0, 'col1')
    assert ray_df.loc[cell] == pd_df.loc[cell]

    # Lists first, then slices; `slice(1, None)` is the `1:` form.
    indexers = (
        [1, 2],
        ([1, 2], ['col1']),
        (slice(1, None), 'col1'),
        (slice(1, 2), 'col1'),
        (slice(1, 2), slice('col1', 'col2')),
    )
    for indexer in indexers:
        assert ray_df.loc[indexer].equals(pd_df.loc[indexer])
def test_is_copy():
    """Reading the `is_copy` attribute must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        # Attribute access only; nothing is called.
        frame.is_copy
def test___itruediv__():
    """Evaluating `__itruediv__()` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__itruediv__()
def test___div__():
    """Calling `__div__` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.__div__(None)
def test_at():
    """Evaluating `at()` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.at()
def test_ix():
    """Evaluating `ix()` must raise NotImplementedError."""
    frame = create_test_dataframe()

    with pytest.raises(NotImplementedError):
        frame.ix()
@pytest.fixture
def test_iloc(ray_df, pd_df):
    """Compare positional `.iloc` lookups between ray and pandas.

    Covers a single row position, a single cell, position lists and
    position slices, checked in that order.
    """
    # Singleton row: compared with `.equals`.
    row_position = 0
    assert ray_df.iloc[row_position].equals(pd_df.iloc[row_position])

    # Singleton cell: compared with `==`.
    cell = (0, 1)
    assert ray_df.iloc[cell] == pd_df.iloc[cell]

    # Lists first, then slices; `slice(1, None)` is the `1:` form.
    indexers = (
        [1, 2],
        ([1, 2], [1, 0]),
        (slice(1, None), 0),
        (slice(1, 2), 0),
        (slice(1, 2), slice(0, 2)),
    )
    for indexer in indexers:
        assert ray_df.iloc[indexer].equals(pd_df.iloc[indexer])
def test__doc__():
    """Check how `rdf.DataFrame` docstrings relate to `pd.DataFrame`.

    The class docstring and `__init__` must differ from the pandas ones.
    Every other callable or property in `rdf.DataFrame.__dict__` that has
    a callable or property of the same name on `pd.DataFrame` must carry
    an identical docstring.
    """
    ray_cls = rdf.DataFrame
    pandas_cls = pd.DataFrame

    assert ray_cls.__doc__ != pandas_cls.__doc__
    assert ray_cls.__init__ != pandas_cls.__init__

    def is_documented_member(member):
        return callable(member) or isinstance(member, property)

    for name, member in ray_cls.__dict__.items():
        if name == "__init__" or not is_documented_member(member):
            continue
        # getattr default of None covers names pandas does not define.
        counterpart = getattr(pandas_cls, name, None)
        if is_documented_member(counterpart):
            assert member.__doc__ == counterpart.__doc__