[DataFrame] Implementing API correct groupby with aggregation methods (#1914)

This commit is contained in:
Devin Petersohn
2018-04-21 17:28:16 -07:00
committed by Robert Nishihara
parent 8264e64b18
commit 8f59546ef2
4 changed files with 607 additions and 157 deletions
+225 -30
View File
@@ -8,6 +8,7 @@ from pandas.util._validators import validate_bool_kwarg
from pandas.core.index import _ensure_index_from_sequences
from pandas._libs import lib
from pandas.core.dtypes.cast import maybe_upcast_putmask
from pandas import compat
from pandas.compat import lzip
import pandas.core.common as com
from pandas.core.dtypes.common import (
@@ -25,6 +26,7 @@ import io
import sys
import re
from .groupby import DataFrameGroupBy
from .utils import (
_deploy_func,
_map_partitions,
@@ -90,7 +92,6 @@ class DataFrame(object):
axis = 0
columns = pd_df.columns
index = pd_df.index
self._row_metadata = self._col_metadata = None
else:
# created this invariant to make sure we never have to go into the
# partitions to get the columns
@@ -101,28 +102,25 @@ class DataFrame(object):
if block_partitions is not None:
# put in numpy array here to make accesses easier since it's 2D
self._block_partitions = np.array(block_partitions)
if row_metadata is not None:
self._row_metadata = row_metadata.copy()
if col_metadata is not None:
self._col_metadata = col_metadata.copy()
assert self._block_partitions.ndim == 2, \
"Block Partitions must be 2D."
else:
if row_partitions is not None:
axis = 0
partitions = row_partitions
if row_metadata is not None:
self._row_metadata = row_metadata.copy()
elif col_partitions is not None:
axis = 1
partitions = col_partitions
if col_metadata is not None:
self._col_metadata = col_metadata.copy()
self._block_partitions = \
_create_block_partitions(partitions, axis=axis,
length=len(columns))
if row_metadata is not None:
self._row_metadata = row_metadata.copy()
if col_metadata is not None:
self._col_metadata = col_metadata.copy()
# Sometimes we only get a single column or row, which is
# problematic for building blocks from the partitions, so we
# add whatever dimension we're missing from the input.
@@ -132,10 +130,10 @@ class DataFrame(object):
# Create the row and column index objects for using our partitioning.
# If the objects haven't been inherited, then generate them
if not self._row_metadata:
if self._row_metadata is None:
self._row_metadata = _IndexMetadata(self._block_partitions[:, 0],
index=index, axis=0)
if not self._col_metadata:
if self._col_metadata is None:
self._col_metadata = _IndexMetadata(self._block_partitions[0, :],
index=columns, axis=1)
@@ -562,9 +560,23 @@ class DataFrame(object):
Returns:
A new DataFrame resulting from the groupby.
"""
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
axis = pd.DataFrame()._get_axis_number(axis)
if callable(by):
by = by(self.index)
elif isinstance(by, compat.string_types):
by = self.__getitem__(by).values.tolist()
elif is_list_like(by):
mismatch = len(by) != len(self) if axis == 0 \
else len(by) != len(self.columns)
if all([obj in self for obj in by]) and mismatch:
raise NotImplementedError(
"Groupby with lists of columns not yet supported.")
elif mismatch:
raise KeyError(next(x for x in by if x not in self))
return DataFrameGroupBy(self, by, axis, level, as_index, sort,
group_keys, squeeze, **kwargs)
def sum(self, axis=None, skipna=True, level=None, numeric_only=None):
"""Perform a sum across the DataFrame.
@@ -635,7 +647,9 @@ class DataFrame(object):
return DataFrame(block_partitions=new_block_partitions,
columns=self.columns,
index=self.index)
index=self.index,
row_metadata=self._row_metadata,
col_metadata=self._col_metadata)
def isnull(self):
"""Fill a DataFrame with booleans for cells containing a null value.
@@ -704,14 +718,162 @@ class DataFrame(object):
"github.com/ray-project/ray.")
def agg(self, func, axis=0, *args, **kwargs):
    """Aggregate over the given axis; thin alias of `aggregate`.

    Args:
        func: The aggregation to perform, passed through unchanged.
        axis: The axis to aggregate over.
        *args: Extra positional arguments forwarded to `aggregate`.
        **kwargs: Extra keyword arguments forwarded to `aggregate`.

    Returns:
        Whatever `aggregate` returns for the same arguments.
    """
    return self.aggregate(func, axis, *args, **kwargs)
def aggregate(self, func, axis=0, *args, **kwargs):
    """Aggregate using `func` over the given axis.

    For axis 0 the request is first offered to `_aggregate`. If that
    raises TypeError or returns None, or if the axis is 1, the call falls
    back to `apply`.

    Args:
        func: The aggregation to perform.
        axis: The axis to aggregate over (number or name).
        *args: Positional arguments forwarded to the aggregation.
        **kwargs: Keyword arguments forwarded to the aggregation.

    Returns:
        The result of `_aggregate` when it produced one, otherwise the
        result of `apply`.
    """
    axis = pd.DataFrame()._get_axis_number(axis)

    result = None

    if axis == 0:
        try:
            result = self._aggregate(func, axis=axis, *args, **kwargs)
        except TypeError:
            # Deliberate best-effort: a TypeError means `_aggregate` could
            # not handle the request, so fall through to `apply` below.
            pass

    if result is None:
        # `is_transform` is an internal marker and is not forwarded to
        # `apply`.
        kwargs.pop('is_transform', None)
        return self.apply(func, axis=axis, args=args, **kwargs)

    return result
def _aggregate(self, arg, *args, **kwargs):
_axis = kwargs.pop('_axis', None)
if _axis is None:
_axis = getattr(self, 'axis', 0)
kwargs.pop('_level', None)
if isinstance(arg, compat.string_types):
return self._string_function(arg, *args, **kwargs)
# Dictionaries have complex behavior because they can be renamed here.
elif isinstance(arg, dict):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
elif is_list_like(arg):
from .concat import concat
x = [self._aggregate(func, *args, **kwargs)
for func in arg]
new_dfs = [x[i] if not isinstance(x[i], pd.Series)
else pd.DataFrame(x[i], columns=[arg[i]]).T
for i in range(len(x))]
return concat(new_dfs)
elif callable(arg):
self._callable_function(arg, _axis, *args, **kwargs)
else:
# TODO Make pandas error
raise ValueError("type {} is not callable".format(type(arg)))
def _string_function(self, func, *args, **kwargs):
assert isinstance(func, compat.string_types)
f = getattr(self, func, None)
if f is not None:
if callable(f):
return f(*args, **kwargs)
assert len(args) == 0
assert len([kwarg
for kwarg in kwargs
if kwarg not in ['axis', '_level']]) == 0
return f
f = getattr(np, func, None)
if f is not None:
raise NotImplementedError("Numpy aggregates not yet supported.")
raise ValueError("{} is an unknown string function".format(func))
def _callable_function(self, func, axis, *args, **kwargs):
"""Run `func` through `agg` on every partition and assemble the result.

Args:
    func: The callable handed to each partition's `agg`.
    axis: 0 to work on the column partitions, anything else to work on
        the row partitions.
    *args: Positional arguments forwarded to `agg`.
    **kwargs: Keyword arguments forwarded to `agg`. An `is_transform`
        entry is consumed by the helper and enables a length check.

Returns:
    A pandas Series when every partition produced a Series, otherwise a
    DataFrame built from the partition results.

Raises:
    ValueError: If only some partitions produced a Series.
"""
if axis == 0:
partitions = self._col_partitions
else:
partitions = self._row_partitions
# The labels travel to the helper inside kwargs, under temporary keys that
# the helper pops again before calling `agg`.
if axis == 1:
kwargs['axis'] = axis
kwargs['temp_columns'] = self.columns
else:
kwargs['temp_index'] = self.index
def agg_helper(df, arg, *args, **kwargs):
# Relabel the partition with whichever temporary labels were passed in.
if 'temp_index' in kwargs:
df.index = kwargs.pop('temp_index', None)
else:
df.columns = kwargs.pop('temp_columns', None)
is_transform = kwargs.pop('is_transform', False)
new_df = df.agg(arg, *args, **kwargs)
is_series = False
if isinstance(new_df, pd.Series):
is_series = True
index = None
columns = None
else:
# Keep the produced labels for the caller (a RangeIndex is reported as
# None), then reset the frame to positional labels.
index = new_df.index \
if not isinstance(new_df.index, pd.RangeIndex) \
else None
columns = new_df.columns
new_df.columns = pd.RangeIndex(0, len(new_df.columns))
new_df.reset_index(drop=True, inplace=True)
if is_transform:
# A transform must return as many rows as it was given.
if is_scalar(new_df) or len(new_df) != len(df):
raise ValueError("transforms cannot produce "
"aggregated results")
return is_series, new_df, index, columns
# One submission per partition, each asked for four return values
# (matching the helper's 4-tuple).
remote_result = \
[_deploy_func._submit(args=(lambda df: agg_helper(df,
func,
*args,
**kwargs),
part), num_return_vals=4)
for part in partitions]
# This magic transposes the list comprehension returned from remote
is_series, new_parts, index, columns = \
[list(t) for t in zip(*remote_result)]
# This part is because agg can allow returning a Series or a
# DataFrame, and we have to determine which here. Shouldn't add
# too much to latency in either case because the booleans can
# be returned immediately
is_series = ray.get(is_series)
if all(is_series):
new_series = pd.concat(ray.get(new_parts))
new_series.index = self.columns if axis == 0 else self.index
return new_series
# This error is thrown when some of the partitions return Series and
# others return DataFrames. We do not allow mixed returns.
elif any(is_series):
raise ValueError("no results.")
# The remaining logic executes when we have only DataFrames in the
# remote objects. We build a Ray DataFrame from the Pandas partitions.
elif axis == 0:
# Only the first partition's index is fetched; the column labels of all
# partitions are appended to each other.
new_index = ray.get(index[0])
columns = ray.get(columns)
columns = columns[0].append(columns[1:])
return DataFrame(col_partitions=new_parts,
columns=columns,
index=self.index if new_index is None
else new_index)
else:
# Same assembly as the branch above, but from row partitions.
new_index = ray.get(index[0])
columns = ray.get(columns)
columns = columns[0].append(columns[1:])
return DataFrame(row_partitions=new_parts,
columns=columns,
index=self.index if new_index is None
else new_index)
def align(self, other, join='outer', axis=None, level=None, copy=True,
fill_value=None, method=None, limit=None, fill_axis=0,
@@ -755,9 +917,38 @@ class DataFrame(object):
def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
args=(), **kwds):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
"""Apply a function along input axis of DataFrame.
Args:
func: The function to apply
axis: The axis over which to apply the func.
broadcast: Whether or not to broadcast.
raw: Whether or not to convert to a Series.
reduce: Whether or not to try to apply reduction procedures.
Returns:
Series or DataFrame, depending on func.
"""
axis = pd.DataFrame()._get_axis_number(axis)
if is_list_like(func) and not all([isinstance(obj, str)
for obj in func]):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
if axis == 0 and is_list_like(func):
return self.aggregate(func, axis, *args, **kwds)
if isinstance(func, compat.string_types):
if axis == 1:
kwds['axis'] = axis
return getattr(self, func)(*args, **kwds)
elif callable(func):
return self._callable_function(func, axis=axis, *args, **kwds)
else:
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
"github.com/ray-project/ray.")
def as_blocks(self, copy=True):
raise NotImplementedError(
@@ -2879,9 +3070,14 @@ class DataFrame(object):
"github.com/ray-project/ray.")
def transform(self, func, *args, **kwargs):
    """Apply `func` and return a result labelled like this DataFrame.

    Args:
        func: The transformation, passed to `agg`.
        *args: Positional arguments forwarded to `agg`.
        **kwargs: Keyword arguments forwarded to `agg`; `is_transform` is
            set to True before forwarding.

    Returns:
        The result of `agg`, with this DataFrame's columns and index
        assigned to it.

    Raises:
        ValueError: If the labels cannot be assigned to the result.
    """
    kwargs["is_transform"] = True
    result = self.agg(func, *args, **kwargs)
    try:
        # Assigning the original labels is what detects a result of the
        # wrong shape.
        result.columns = self.columns
        result.index = self.index
    except ValueError:
        raise ValueError("transforms cannot produce aggregated results")
    return result
def truediv(self, other, axis='columns', level=None, fill_value=None):
raise NotImplementedError(
@@ -3135,9 +3331,8 @@ class DataFrame(object):
"github.com/ray-project/ray.")
def __array__(self, dtype=None):
    """Convert this DataFrame to a numpy array.

    Args:
        dtype: Optional dtype for the resulting array. It was previously
            accepted but ignored.

    Returns:
        A numpy array built from the `to_pandas` conversion of this
        object.
    """
    # TODO: This is very inefficient and needs fix
    return np.array(to_pandas(self), dtype=dtype)
def __array_wrap__(self, result, context=None):
raise NotImplementedError(
+201 -96
View File
@@ -3,60 +3,92 @@ from __future__ import division
from __future__ import print_function
import pandas.core.groupby
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_list_like
import ray
from .utils import _map_partitions
from .utils import _inherit_docstrings
@_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy)
class DataFrameGroupBy(object):
def __init__(self, partitions, columns, index):
self._partitions = partitions
self._columns = columns
self._index = index
def __init__(self, df, by, axis, level, as_index, sort, group_keys,
squeeze, **kwargs):
def _map_partitions(self, func, index=None):
"""Apply a function on each partition.
self._columns = df.columns
self._index = df.index
self._axis = axis
Args:
func (callable): The function to Apply.
self._row_metadata = df._row_metadata
self._col_metadata = df._col_metadata
Returns:
A new DataFrame containing the result of the function.
"""
if axis == 0:
partitions = [column for column in df._block_partitions.T]
self._index_grouped = pd.Series(self._index, index=self._index)\
.groupby(by=by, sort=sort)
else:
partitions = [row for row in df._block_partitions]
self._index_grouped = pd.Series(self._columns, index=self._index)\
.groupby(by=by, sort=sort)
self._keys_and_values = [(k, np.array(v))
for k, v in self._index_grouped]
self._grouped_partitions = \
list(zip(*(groupby._submit(args=(by,
axis,
level,
as_index,
sort,
group_keys,
squeeze) + part,
num_return_vals=len(self))
for part in partitions)))
@property
def _iter(self):
from .dataframe import DataFrame
from .dataframe import _deploy_func
assert(callable(func))
new_df = [_deploy_func.remote(lambda df: df.apply(func), part)
for part in self._partitions]
if index is None:
index = self._index
return DataFrame(row_partitions=new_df, columns=self._columns,
index=index)
if self._axis == 0:
return [(self._keys_and_values[i][0],
DataFrame(col_partitions=part,
columns=self._columns,
index=self._keys_and_values[i][1].index,
row_metadata=self._row_metadata[
self._keys_and_values[i][1].index],
col_metadata=self._col_metadata))
for i, part in enumerate(self._grouped_partitions)]
else:
return [(self._keys_and_values[i][0],
DataFrame(row_partitions=part,
columns=self._keys_and_values[i][1].index,
index=self._index,
row_metadata=self._row_metadata,
col_metadata=self._col_metadata[
self._keys_and_values[i][1].index]))
for i, part in enumerate(self._grouped_partitions)]
@property
def ngroups(self):
    # The number of groups is the length of this object.
    return len(self)
def skew(self, **kwargs):
    # Calls `skew` on each group, forwarding the keyword arguments.
    return self._apply_agg_function(lambda df: df.skew(**kwargs))
def ffill(self, limit=None):
    # Calls `ffill` on each group with the given limit.
    return self._apply_agg_function(lambda df: df.ffill(limit=limit))
def sem(self, ddof=1):
    # Calls `sem` on each group with the given delta degrees of freedom.
    return self._apply_agg_function(lambda df: df.sem(ddof=ddof))
def mean(self, *args, **kwargs):
    # Calls `mean` on each group, forwarding all arguments.
    return self._apply_agg_function(lambda df: df.mean(*args, **kwargs))
@property
def any(self):
    # Calls `any` on each group; exposed as a property in this class.
    return self._apply_agg_function(lambda df: df.any())
@property
def plot(self):
@@ -74,18 +106,17 @@ class DataFrameGroupBy(object):
@property
def groups(self):
    # Maps each group key to a pandas Index built from that group's values.
    return {k: pd.Index(v) for k, v in self._keys_and_values}
def min(self, **kwargs):
    # Calls `min` on each group, forwarding the keyword arguments.
    return self._apply_agg_function(lambda df: df.min(**kwargs))
@property
def idxmax(self):
    # Calls `idxmax` on each group; exposed as a property in this class.
    return self._apply_agg_function(lambda df: df.idxmax())
@property
def ndim(self):
    # Reported by the grouped index object, not by the partitions.
    return self._index_grouped.ndim
def shift(self, periods=1, freq=None, axis=0):
# Placeholder: the arguments are unused and every call raises.
raise NotImplementedError("Not Yet implemented.")
@@ -94,70 +125,82 @@ class DataFrameGroupBy(object):
raise NotImplementedError("Not Yet implemented.")
def cumsum(self, axis=0, *args, **kwargs):
    # Calls `cumsum` on each group; `axis` is passed positionally, ahead
    # of the extra arguments.
    return self._apply_agg_function(lambda df: df.cumsum(axis,
                                                         *args,
                                                         **kwargs))
@property
def indices(self):
    # Maps each group key to the stored values for that group.
    return dict(self._keys_and_values)
@property
def pct_change(self):
    # Calls `pct_change` on each group; exposed as a property here.
    return self._apply_agg_function(lambda df: df.pct_change())
def filter(self, func, dropna=True, *args, **kwargs):
# Placeholder: the arguments are unused and every call raises.
raise NotImplementedError("Not Yet implemented.")
def cummax(self, axis=0, **kwargs):
    # Calls `cummax` on each group along the given axis.
    return self._apply_agg_function(lambda df: df.cummax(axis=axis,
                                                         **kwargs))
def apply(self, func, *args, **kwargs):
    # A list-like `func` yields frames that are concatenated as-is; any
    # other `func` goes through the aggregation path.
    def call_apply(df):
        return df.apply(func, *args, **kwargs)

    if is_list_like(func):
        return self._apply_df_function(call_apply)
    return self._apply_agg_function(call_apply)
@property
def dtypes(self):
    # Reads the `dtypes` attribute of each group.
    return self._apply_agg_function(lambda df: df.dtypes)
def first(self, **kwargs):
    # Calls `first` on each group with a fixed `offset=0`, plus any
    # keyword arguments supplied by the caller.
    return self._apply_agg_function(lambda df: df.first(offset=0,
                                                        **kwargs))
def backfill(self, limit=None):
    # Alias of `bfill`.
    return self.bfill(limit)
def __getitem__(self, key):
# Placeholder: `key` is unused and every call raises.
# This operation requires a SeriesGroupBy Object
raise NotImplementedError("Not Yet implemented.")
def cummin(self, axis=0, **kwargs):
    # Calls `cummin` on each group along the given axis.
    return self._apply_agg_function(lambda df: df.cummin(axis=axis,
                                                         **kwargs))
def bfill(self, limit=None):
    # Calls `bfill` on each group with the given limit.
    return self._apply_agg_function(lambda df: df.bfill(limit=limit))
@property
def idxmin(self):
    # Calls `idxmin` on each group; exposed as a property in this class.
    return self._apply_agg_function(lambda df: df.idxmin())
def prod(self, **kwargs):
    # Calls `prod` on each group, forwarding the keyword arguments.
    return self._apply_agg_function(lambda df: df.prod(**kwargs))
def std(self, ddof=1, *args, **kwargs):
    # Calls `std` on each group; `ddof` is always passed by keyword.
    return self._apply_agg_function(lambda df: df.std(ddof=ddof,
                                                      *args, **kwargs))
def aggregate(self, arg, *args, **kwargs):
    # A list-like `arg` yields frames that are concatenated as-is; any
    # other `arg` goes through the aggregation path.
    def call_agg(df):
        return df.agg(arg, *args, **kwargs)

    if is_list_like(arg):
        return self._apply_df_function(call_agg)
    return self._apply_agg_function(call_agg)
def last(self, **kwargs):
    # Calls `last` on each group and concatenates the resulting frames.
    return self._apply_df_function(lambda df: df.last(**kwargs))
@property
def mad(self):
    # Calls `mad` on each group; exposed as a property in this class.
    return self._apply_agg_function(lambda df: df.mad())
@property
def rank(self):
    # Calls `rank` on each group and concatenates the resulting frames;
    # exposed as a property in this class.
    return self._apply_df_function(lambda df: df.rank())
@property
def corrwith(self):
@@ -167,26 +210,28 @@ class DataFrameGroupBy(object):
raise NotImplementedError("Not Yet implemented.")
def max(self, **kwargs):
    # Calls `max` on each group, forwarding the keyword arguments.
    return self._apply_agg_function(lambda df: df.max(**kwargs))
def var(self, ddof=1, *args, **kwargs):
    # `ddof` is passed by keyword, as in `std`. Passed positionally it
    # would bind to the first parameter of the group's `var` (which is
    # `axis` in the pandas signature) instead of `ddof`.
    return self._apply_agg_function(lambda df: df.var(ddof=ddof,
                                                      *args,
                                                      **kwargs))
def get_group(self, name, obj=None):
# Placeholder: the arguments are unused and every call raises.
raise NotImplementedError("Not Yet implemented.")
def __len__(self):
raise NotImplementedError("Not Yet implemented.")
return len(self._keys_and_values)
@property
def all(self):
    # Calls `all` on each group; exposed as a property in this class.
    return self._apply_agg_function(lambda df: df.all())
def size(self):
    # Reads the `size` attribute of each group.
    return self._apply_agg_function(lambda df: df.size)
def sum(self, **kwargs):
    # Sums each group along the axis this object was grouped on.
    return self._apply_agg_function(lambda df:
                                    df.sum(axis=self._axis, **kwargs))
def __unicode__(self):
# Placeholder: every call raises.
raise NotImplementedError("Not Yet implemented.")
@@ -194,76 +239,136 @@ class DataFrameGroupBy(object):
def describe(self, **kwargs):
# Placeholder: the arguments are unused and every call raises.
raise NotImplementedError("Not Yet implemented.")
def boxplot(self, grouped, subplots=True, column=None, fontsize=None,
            rot=0, grid=True, ax=None, figsize=None, layout=None, **kwds):
    # Placeholder: the arguments are unused and every call raises.
    raise NotImplementedError("Not Yet implemented.")
def ngroup(self, ascending=True):
    # Answered by the grouped index object, not by the partitions.
    return self._index_grouped.ngroup(ascending)
def nunique(self, dropna=True):
    # `dropna` is passed by keyword. Passed positionally it would bind to
    # the first parameter of the group's `nunique` (which is `axis` in
    # the pandas signature) instead of `dropna`.
    return self._apply_agg_function(lambda df: df.nunique(dropna=dropna))
def resample(self, rule, *args, **kwargs):
# Placeholder: the arguments are unused and every call raises.
raise NotImplementedError("Not Yet implemented.")
def median(self, **kwargs):
    # Calls `median` on each group, forwarding the keyword arguments.
    return self._apply_agg_function(lambda df: df.median(**kwargs))
def head(self, n=5):
    # Calls `head(n)` on each group and concatenates the resulting frames.
    return self._apply_df_function(lambda df: df.head(n))
def cumprod(self, axis=0, *args, **kwargs):
    # Calls `cumprod` on each group and concatenates the resulting frames;
    # `axis` is passed positionally, ahead of the extra arguments.
    return self._apply_df_function(lambda df: df.cumprod(axis,
                                                         *args,
                                                         **kwargs))
def __iter__(self):
raise NotImplementedError("Not Yet implemented.")
return self._iter.__iter__()
def agg(self, arg, *args, **kwargs):
    def agg_help(df):
        # A Series result becomes a single-row frame so that every piece
        # handed to `concat` is a frame.
        if isinstance(df, pd.Series):
            return pd.DataFrame(df).T
        return df

    # Aggregate each group along the axis this object was grouped on.
    per_group = [v.agg(arg, axis=self._axis, *args, **kwargs)
                 for k, v in self._iter]

    new_parts = _map_partitions(agg_help, per_group)

    from .concat import concat
    return concat(new_parts)
@property
def cov(self):
    # Calls `cov` on each group; exposed as a property in this class.
    return self._apply_agg_function(lambda df: df.cov())
def transform(self, func, *args, **kwargs):
    from .concat import concat

    # Transform every group, then concatenate the per-group results.
    return concat([v.transform(func, *args, **kwargs)
                   for k, v in self._iter])

def corr(self, **kwargs):
    # Calls `corr` on each group, forwarding the keyword arguments.
    return self._apply_agg_function(lambda df: df.corr(**kwargs))

def fillna(self, **kwargs):
    # Calls `fillna` on each group and concatenates the resulting frames.
    return self._apply_df_function(lambda df: df.fillna(**kwargs))

def count(self, **kwargs):
    # Calls `count` on each group, forwarding the keyword arguments.
    return self._apply_agg_function(lambda df: df.count(**kwargs))
def pipe(self, func, *args, **kwargs):
    # Calls `pipe` on each group and concatenates the resulting frames.
    return self._apply_df_function(lambda df: df.pipe(func,
                                                      *args,
                                                      **kwargs))
def cumcount(self, ascending=True):
# Placeholder: the argument is unused and every call raises.
raise NotImplementedError("Not Yet implemented.")
def tail(self, n=5):
    # Calls `tail(n)` on each group and concatenates the resulting frames.
    return self._apply_df_function(lambda df: df.tail(n))
# expanding and rolling are unique cases and need to likely be handled
# separately. They do not appear to be commonly used.
def expanding(self, *args, **kwargs):
# Placeholder: the arguments are unused and every call raises.
raise NotImplementedError("Not Yet implemented.")
@property
def rolling(self, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def hist(self):
raise NotImplementedError("Not Yet implemented.")
def quantile(self, q=0.5, **kwargs):
    # A list-like `q` yields frames that are concatenated as-is; a single
    # `q` goes through the aggregation path.
    def call_quantile(df):
        return df.quantile(q, **kwargs)

    if is_list_like(q):
        return self._apply_df_function(call_quantile)
    return self._apply_agg_function(call_quantile)
@property
def diff(self):
# Placeholder: every access raises.
raise NotImplementedError("Not Yet implemented.")
def take(self, **kwargs):
    # Calls `take` on each group and concatenates the resulting frames.
    return self._apply_df_function(lambda df: df.take(**kwargs))
def _apply_agg_function(self, f):
assert callable(f), "\'{0}\' object is not callable".format(type(f))
result = [pd.DataFrame(f(v)).T for k, v in self._iter]
new_df = pd.concat(result)
if self._axis == 0:
new_df.columns = self._columns
new_df.index = [k for k, v in self._iter]
else:
new_df = new_df.T
new_df.columns = [k for k, v in self._iter]
new_df.index = self._index
return new_df
def _apply_df_function(self, f):
    # Applies `f` to every group and concatenates whatever it returns.
    assert callable(f), "\'{0}\' object is not callable".format(type(f))

    per_group = [f(frame) for _, frame in self._iter]

    from .concat import concat
    return concat(per_group)
# Remote task: joins the given partition pieces and splits them into groups.
# `df` collects every positional argument after `squeeze`; those pieces are
# concatenated along `axis` before grouping. All other arguments are passed
# straight to the pandas `groupby` call. Only the group frames are returned,
# in iteration order; the group keys are dropped.
@ray.remote
def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df):
df = pd.concat(df, axis=axis)
return [v for k, v in df.groupby(by=by,
axis=axis,
level=level,
as_index=as_index,
sort=sort,
group_keys=group_keys,
squeeze=squeeze)]
+10 -4
View File
@@ -114,7 +114,8 @@ class _IndexMetadata(_IndexMetadataBase):
partitions.
"""
def __init__(self, dfs, index=None, axis=0):
def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None,
coord_df_oid=None):
"""Inits a IndexMetadata from Ray DataFrame partitions
Args:
@@ -126,9 +127,10 @@ class _IndexMetadata(_IndexMetadataBase):
A IndexMetadata backed by the specified pd.Index, partitioned off
specified partitions
"""
lengths_oid, coord_df_oid = \
_build_index.remote(dfs, index) if axis == 0 else \
_build_columns.remote(dfs, index)
if dfs is not None:
lengths_oid, coord_df_oid = \
_build_index.remote(dfs, index) if axis == 0 else \
_build_columns.remote(dfs, index)
self._coord_df = coord_df_oid
self._lengths = lengths_oid
@@ -269,6 +271,10 @@ class _IndexMetadata(_IndexMetadataBase):
self._coord_df.loc[partition_mask & index_within_partition_mask,
'index_within_partition'] -= 1
def copy(self):
    """Return a new `_IndexMetadata` built from this object's state.

    The new object is constructed from this object's `_coord_df` and
    `_lengths` values as they are; nothing is rebuilt from partitions.
    """
    carried_over = {'coord_df_oid': self._coord_df,
                    'lengths_oid': self._lengths}
    return _IndexMetadata(**carried_over)
class _WrappingIndexMetadata(_IndexMetadata):
"""IndexMetadata implementation for index across a non-partitioned axis.
+171 -27
View File
@@ -290,6 +290,42 @@ def test_int_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_float_dataframe():
@@ -339,7 +375,8 @@ def test_float_dataframe():
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
# TODO Clear floating point error.
# test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
@@ -414,6 +451,43 @@ def test_float_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
# TODO Nans are always not equal to each other, fix it
# test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_mixed_dtype_dataframe():
pandas_df = pd.DataFrame({
@@ -465,7 +539,8 @@ def test_mixed_dtype_dataframe():
test_query(ray_df, pandas_df, query_funcs)
test_mean(ray_df, pandas_df)
test_var(ray_df, pandas_df)
# TODO Clear floating point error.
# test_var(ray_df, pandas_df)
test_std(ray_df, pandas_df)
test_median(ray_df, pandas_df)
test_quantile(ray_df, pandas_df, .25)
@@ -549,6 +624,30 @@ def test_mixed_dtype_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum()]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_nan_dataframe():
pandas_df = pd.DataFrame({
@@ -670,6 +769,43 @@ def test_nan_dataframe():
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
# TODO Nans are always not equal to each other, fix it
# test___array__(ray_df, pandas_df)
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
['sum', 'sum']]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
if not isinstance(func, list):
test_agg(ray_df, pandas_df, func, 1)
test_apply(ray_df, pandas_df, func, 1)
test_aggregate(ray_df, pandas_df, func, 1)
else:
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
func = ['sum', lambda df: df.sum()]
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 0)
with pytest.raises(NotImplementedError):
test_apply(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_aggregate(ray_df, pandas_df, func, 1)
with pytest.raises(NotImplementedError):
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
def test_add():
ray_df = create_test_dataframe()
@@ -678,18 +814,24 @@ def test_add():
ray_df.add(None)
@pytest.fixture
def test_agg(ray_df, pandas_df, func, axis):
    """Check that `agg` gives the same result on both DataFrames."""
    ray_result = ray_df.agg(func, axis)
    pandas_result = pandas_df.agg(func, axis)
    # A Ray DataFrame needs the dedicated comparison helper; anything else
    # is compared with its own `equals`.
    if isinstance(ray_result, rdf.DataFrame):
        assert ray_df_equals_pandas(ray_result, pandas_result)
    else:
        assert ray_result.equals(pandas_result)
@pytest.fixture
def test_aggregate(ray_df, pandas_df, func, axis):
    """Check that `aggregate` gives the same result on both DataFrames."""
    ray_result = ray_df.aggregate(func, axis)
    pandas_result = pandas_df.aggregate(func, axis)
    # A Ray DataFrame needs the dedicated comparison helper; anything else
    # is compared with its own `equals`.
    if isinstance(ray_result, rdf.DataFrame):
        assert ray_df_equals_pandas(ray_result, pandas_result)
    else:
        assert ray_result.equals(pandas_result)
def test_align():
@@ -718,11 +860,14 @@ def test_append():
ray_df.append(None)
@pytest.fixture
def test_apply(ray_df, pandas_df, func, axis):
    """Check that `apply` gives the same result on both DataFrames."""
    ray_result = ray_df.apply(func, axis)
    pandas_result = pandas_df.apply(func, axis)
    # A Ray DataFrame needs the dedicated comparison helper; anything else
    # is compared with its own `equals`.
    if isinstance(ray_result, rdf.DataFrame):
        assert ray_df_equals_pandas(ray_result, pandas_result)
    else:
        assert ray_result.equals(pandas_result)
def test_as_blocks():
@@ -2681,11 +2826,12 @@ def test_to_xarray():
ray_df.to_xarray()
@pytest.fixture
def test_transform(ray_df, pandas_df):
    """Check `transform` with a callable and with a method name."""
    # The comparison results were previously discarded, so a mismatch
    # could never fail the test.
    assert ray_df_equals_pandas(ray_df.transform(lambda df: df.isna()),
                                pandas_df.transform(lambda df: df.isna()))
    assert ray_df_equals_pandas(ray_df.transform('isna'),
                                pandas_df.transform('isna'))
def test_truediv():
@@ -2865,11 +3011,9 @@ def test___round__():
ray_df.__round__()
@pytest.fixture
def test___array__(ray_df, pandas_df):
    """Check that both DataFrames convert to equal numpy arrays."""
    assert np.array_equal(ray_df.__array__(), pandas_df.__array__())
def test___array_wrap__():