mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 14:42:31 +08:00
[DataFrame] Implementing API correct groupby with aggregation methods (#1914)
This commit is contained in:
committed by
Robert Nishihara
parent
8264e64b18
commit
8f59546ef2
@@ -8,6 +8,7 @@ from pandas.util._validators import validate_bool_kwarg
|
||||
from pandas.core.index import _ensure_index_from_sequences
|
||||
from pandas._libs import lib
|
||||
from pandas.core.dtypes.cast import maybe_upcast_putmask
|
||||
from pandas import compat
|
||||
from pandas.compat import lzip
|
||||
import pandas.core.common as com
|
||||
from pandas.core.dtypes.common import (
|
||||
@@ -25,6 +26,7 @@ import io
|
||||
import sys
|
||||
import re
|
||||
|
||||
from .groupby import DataFrameGroupBy
|
||||
from .utils import (
|
||||
_deploy_func,
|
||||
_map_partitions,
|
||||
@@ -90,7 +92,6 @@ class DataFrame(object):
|
||||
axis = 0
|
||||
columns = pd_df.columns
|
||||
index = pd_df.index
|
||||
self._row_metadata = self._col_metadata = None
|
||||
else:
|
||||
# created this invariant to make sure we never have to go into the
|
||||
# partitions to get the columns
|
||||
@@ -101,28 +102,25 @@ class DataFrame(object):
|
||||
if block_partitions is not None:
|
||||
# put in numpy array here to make accesses easier since it's 2D
|
||||
self._block_partitions = np.array(block_partitions)
|
||||
if row_metadata is not None:
|
||||
self._row_metadata = row_metadata.copy()
|
||||
if col_metadata is not None:
|
||||
self._col_metadata = col_metadata.copy()
|
||||
assert self._block_partitions.ndim == 2, \
|
||||
"Block Partitions must be 2D."
|
||||
else:
|
||||
if row_partitions is not None:
|
||||
axis = 0
|
||||
partitions = row_partitions
|
||||
if row_metadata is not None:
|
||||
self._row_metadata = row_metadata.copy()
|
||||
elif col_partitions is not None:
|
||||
axis = 1
|
||||
partitions = col_partitions
|
||||
if col_metadata is not None:
|
||||
self._col_metadata = col_metadata.copy()
|
||||
|
||||
self._block_partitions = \
|
||||
_create_block_partitions(partitions, axis=axis,
|
||||
length=len(columns))
|
||||
|
||||
if row_metadata is not None:
|
||||
self._row_metadata = row_metadata.copy()
|
||||
if col_metadata is not None:
|
||||
self._col_metadata = col_metadata.copy()
|
||||
|
||||
# Sometimes we only get a single column or row, which is
|
||||
# problematic for building blocks from the partitions, so we
|
||||
# add whatever dimension we're missing from the input.
|
||||
@@ -132,10 +130,10 @@ class DataFrame(object):
|
||||
|
||||
# Create the row and column index objects for using our partitioning.
|
||||
# If the objects haven't been inherited, then generate them
|
||||
if not self._row_metadata:
|
||||
if self._row_metadata is None:
|
||||
self._row_metadata = _IndexMetadata(self._block_partitions[:, 0],
|
||||
index=index, axis=0)
|
||||
if not self._col_metadata:
|
||||
if self._col_metadata is None:
|
||||
self._col_metadata = _IndexMetadata(self._block_partitions[0, :],
|
||||
index=columns, axis=1)
|
||||
|
||||
@@ -562,9 +560,23 @@ class DataFrame(object):
|
||||
Returns:
|
||||
A new DataFrame resulting from the groupby.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
axis = pd.DataFrame()._get_axis_number(axis)
|
||||
if callable(by):
|
||||
by = by(self.index)
|
||||
elif isinstance(by, compat.string_types):
|
||||
by = self.__getitem__(by).values.tolist()
|
||||
elif is_list_like(by):
|
||||
mismatch = len(by) != len(self) if axis == 0 \
|
||||
else len(by) != len(self.columns)
|
||||
|
||||
if all([obj in self for obj in by]) and mismatch:
|
||||
raise NotImplementedError(
|
||||
"Groupby with lists of columns not yet supported.")
|
||||
elif mismatch:
|
||||
raise KeyError(next(x for x in by if x not in self))
|
||||
|
||||
return DataFrameGroupBy(self, by, axis, level, as_index, sort,
|
||||
group_keys, squeeze, **kwargs)
|
||||
|
||||
def sum(self, axis=None, skipna=True, level=None, numeric_only=None):
|
||||
"""Perform a sum across the DataFrame.
|
||||
@@ -635,7 +647,9 @@ class DataFrame(object):
|
||||
|
||||
return DataFrame(block_partitions=new_block_partitions,
|
||||
columns=self.columns,
|
||||
index=self.index)
|
||||
index=self.index,
|
||||
row_metadata=self._row_metadata,
|
||||
col_metadata=self._col_metadata)
|
||||
|
||||
def isnull(self):
|
||||
"""Fill a DataFrame with booleans for cells containing a null value.
|
||||
@@ -704,14 +718,162 @@ class DataFrame(object):
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
def agg(self, func, axis=0, *args, **kwargs):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
return self.aggregate(func, axis, *args, **kwargs)
|
||||
|
||||
def aggregate(self, func, axis=0, *args, **kwargs):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
axis = pd.DataFrame()._get_axis_number(axis)
|
||||
|
||||
result = None
|
||||
|
||||
if axis == 0:
|
||||
try:
|
||||
result = self._aggregate(func, axis=axis, *args, **kwargs)
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
if result is None:
|
||||
kwargs.pop('is_transform', None)
|
||||
return self.apply(func, axis=axis, args=args, **kwargs)
|
||||
|
||||
return result
|
||||
|
||||
def _aggregate(self, arg, *args, **kwargs):
|
||||
_axis = kwargs.pop('_axis', None)
|
||||
if _axis is None:
|
||||
_axis = getattr(self, 'axis', 0)
|
||||
kwargs.pop('_level', None)
|
||||
|
||||
if isinstance(arg, compat.string_types):
|
||||
return self._string_function(arg, *args, **kwargs)
|
||||
|
||||
# Dictionaries have complex behavior because they can be renamed here.
|
||||
elif isinstance(arg, dict):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
elif is_list_like(arg):
|
||||
from .concat import concat
|
||||
|
||||
x = [self._aggregate(func, *args, **kwargs)
|
||||
for func in arg]
|
||||
|
||||
new_dfs = [x[i] if not isinstance(x[i], pd.Series)
|
||||
else pd.DataFrame(x[i], columns=[arg[i]]).T
|
||||
for i in range(len(x))]
|
||||
|
||||
return concat(new_dfs)
|
||||
elif callable(arg):
|
||||
self._callable_function(arg, _axis, *args, **kwargs)
|
||||
else:
|
||||
# TODO Make pandas error
|
||||
raise ValueError("type {} is not callable".format(type(arg)))
|
||||
|
||||
def _string_function(self, func, *args, **kwargs):
|
||||
assert isinstance(func, compat.string_types)
|
||||
|
||||
f = getattr(self, func, None)
|
||||
|
||||
if f is not None:
|
||||
if callable(f):
|
||||
return f(*args, **kwargs)
|
||||
|
||||
assert len(args) == 0
|
||||
assert len([kwarg
|
||||
for kwarg in kwargs
|
||||
if kwarg not in ['axis', '_level']]) == 0
|
||||
return f
|
||||
|
||||
f = getattr(np, func, None)
|
||||
if f is not None:
|
||||
raise NotImplementedError("Numpy aggregates not yet supported.")
|
||||
|
||||
raise ValueError("{} is an unknown string function".format(func))
|
||||
|
||||
def _callable_function(self, func, axis, *args, **kwargs):
|
||||
if axis == 0:
|
||||
partitions = self._col_partitions
|
||||
else:
|
||||
partitions = self._row_partitions
|
||||
|
||||
if axis == 1:
|
||||
kwargs['axis'] = axis
|
||||
kwargs['temp_columns'] = self.columns
|
||||
else:
|
||||
kwargs['temp_index'] = self.index
|
||||
|
||||
def agg_helper(df, arg, *args, **kwargs):
|
||||
if 'temp_index' in kwargs:
|
||||
df.index = kwargs.pop('temp_index', None)
|
||||
else:
|
||||
df.columns = kwargs.pop('temp_columns', None)
|
||||
is_transform = kwargs.pop('is_transform', False)
|
||||
new_df = df.agg(arg, *args, **kwargs)
|
||||
|
||||
is_series = False
|
||||
|
||||
if isinstance(new_df, pd.Series):
|
||||
is_series = True
|
||||
index = None
|
||||
columns = None
|
||||
else:
|
||||
index = new_df.index \
|
||||
if not isinstance(new_df.index, pd.RangeIndex) \
|
||||
else None
|
||||
columns = new_df.columns
|
||||
new_df.columns = pd.RangeIndex(0, len(new_df.columns))
|
||||
new_df.reset_index(drop=True, inplace=True)
|
||||
|
||||
if is_transform:
|
||||
if is_scalar(new_df) or len(new_df) != len(df):
|
||||
raise ValueError("transforms cannot produce "
|
||||
"aggregated results")
|
||||
|
||||
return is_series, new_df, index, columns
|
||||
|
||||
remote_result = \
|
||||
[_deploy_func._submit(args=(lambda df: agg_helper(df,
|
||||
func,
|
||||
*args,
|
||||
**kwargs),
|
||||
part), num_return_vals=4)
|
||||
for part in partitions]
|
||||
|
||||
# This magic transposes the list comprehension returned from remote
|
||||
is_series, new_parts, index, columns = \
|
||||
[list(t) for t in zip(*remote_result)]
|
||||
|
||||
# This part is because agg can allow returning a Series or a
|
||||
# DataFrame, and we have to determine which here. Shouldn't add
|
||||
# too much to latency in either case because the booleans can
|
||||
# be returned immediately
|
||||
is_series = ray.get(is_series)
|
||||
if all(is_series):
|
||||
new_series = pd.concat(ray.get(new_parts))
|
||||
new_series.index = self.columns if axis == 0 else self.index
|
||||
return new_series
|
||||
# This error is thrown when some of the partitions return Series and
|
||||
# others return DataFrames. We do not allow mixed returns.
|
||||
elif any(is_series):
|
||||
raise ValueError("no results.")
|
||||
# The remaining logic executes when we have only DataFrames in the
|
||||
# remote objects. We build a Ray DataFrame from the Pandas partitions.
|
||||
elif axis == 0:
|
||||
new_index = ray.get(index[0])
|
||||
columns = ray.get(columns)
|
||||
columns = columns[0].append(columns[1:])
|
||||
|
||||
return DataFrame(col_partitions=new_parts,
|
||||
columns=columns,
|
||||
index=self.index if new_index is None
|
||||
else new_index)
|
||||
else:
|
||||
new_index = ray.get(index[0])
|
||||
columns = ray.get(columns)
|
||||
columns = columns[0].append(columns[1:])
|
||||
return DataFrame(row_partitions=new_parts,
|
||||
columns=columns,
|
||||
index=self.index if new_index is None
|
||||
else new_index)
|
||||
|
||||
def align(self, other, join='outer', axis=None, level=None, copy=True,
|
||||
fill_value=None, method=None, limit=None, fill_axis=0,
|
||||
@@ -755,9 +917,38 @@ class DataFrame(object):
|
||||
|
||||
def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
|
||||
args=(), **kwds):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
"""Apply a function along input axis of DataFrame.
|
||||
|
||||
Args:
|
||||
func: The function to apply
|
||||
axis: The axis over which to apply the func.
|
||||
broadcast: Whether or not to broadcast.
|
||||
raw: Whether or not to convert to a Series.
|
||||
reduce: Whether or not to try to apply reduction procedures.
|
||||
|
||||
Returns:
|
||||
Series or DataFrame, depending on func.
|
||||
"""
|
||||
axis = pd.DataFrame()._get_axis_number(axis)
|
||||
|
||||
if is_list_like(func) and not all([isinstance(obj, str)
|
||||
for obj in func]):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
if axis == 0 and is_list_like(func):
|
||||
return self.aggregate(func, axis, *args, **kwds)
|
||||
if isinstance(func, compat.string_types):
|
||||
if axis == 1:
|
||||
kwds['axis'] = axis
|
||||
return getattr(self, func)(*args, **kwds)
|
||||
elif callable(func):
|
||||
return self._callable_function(func, axis=axis, *args, **kwds)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
def as_blocks(self, copy=True):
|
||||
raise NotImplementedError(
|
||||
@@ -2879,9 +3070,14 @@ class DataFrame(object):
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
def transform(self, func, *args, **kwargs):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
kwargs["is_transform"] = True
|
||||
result = self.agg(func, *args, **kwargs)
|
||||
try:
|
||||
result.columns = self.columns
|
||||
result.index = self.index
|
||||
except ValueError:
|
||||
raise ValueError("transforms cannot produce aggregated results")
|
||||
return result
|
||||
|
||||
def truediv(self, other, axis='columns', level=None, fill_value=None):
|
||||
raise NotImplementedError(
|
||||
@@ -3135,9 +3331,8 @@ class DataFrame(object):
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
def __array__(self, dtype=None):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
# TODO: This is very inefficient and needs fix
|
||||
return np.array(to_pandas(self))
|
||||
|
||||
def __array_wrap__(self, result, context=None):
|
||||
raise NotImplementedError(
|
||||
|
||||
+201
-96
@@ -3,60 +3,92 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pandas.core.groupby
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
import ray
|
||||
|
||||
from .utils import _map_partitions
|
||||
from .utils import _inherit_docstrings
|
||||
|
||||
|
||||
@_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy)
|
||||
class DataFrameGroupBy(object):
|
||||
|
||||
def __init__(self, partitions, columns, index):
|
||||
self._partitions = partitions
|
||||
self._columns = columns
|
||||
self._index = index
|
||||
def __init__(self, df, by, axis, level, as_index, sort, group_keys,
|
||||
squeeze, **kwargs):
|
||||
|
||||
def _map_partitions(self, func, index=None):
|
||||
"""Apply a function on each partition.
|
||||
self._columns = df.columns
|
||||
self._index = df.index
|
||||
self._axis = axis
|
||||
|
||||
Args:
|
||||
func (callable): The function to Apply.
|
||||
self._row_metadata = df._row_metadata
|
||||
self._col_metadata = df._col_metadata
|
||||
|
||||
Returns:
|
||||
A new DataFrame containing the result of the function.
|
||||
"""
|
||||
if axis == 0:
|
||||
partitions = [column for column in df._block_partitions.T]
|
||||
self._index_grouped = pd.Series(self._index, index=self._index)\
|
||||
.groupby(by=by, sort=sort)
|
||||
else:
|
||||
partitions = [row for row in df._block_partitions]
|
||||
self._index_grouped = pd.Series(self._columns, index=self._index)\
|
||||
.groupby(by=by, sort=sort)
|
||||
|
||||
self._keys_and_values = [(k, np.array(v))
|
||||
for k, v in self._index_grouped]
|
||||
|
||||
self._grouped_partitions = \
|
||||
list(zip(*(groupby._submit(args=(by,
|
||||
axis,
|
||||
level,
|
||||
as_index,
|
||||
sort,
|
||||
group_keys,
|
||||
squeeze) + part,
|
||||
num_return_vals=len(self))
|
||||
for part in partitions)))
|
||||
|
||||
@property
|
||||
def _iter(self):
|
||||
from .dataframe import DataFrame
|
||||
from .dataframe import _deploy_func
|
||||
|
||||
assert(callable(func))
|
||||
new_df = [_deploy_func.remote(lambda df: df.apply(func), part)
|
||||
for part in self._partitions]
|
||||
|
||||
if index is None:
|
||||
index = self._index
|
||||
|
||||
return DataFrame(row_partitions=new_df, columns=self._columns,
|
||||
index=index)
|
||||
if self._axis == 0:
|
||||
return [(self._keys_and_values[i][0],
|
||||
DataFrame(col_partitions=part,
|
||||
columns=self._columns,
|
||||
index=self._keys_and_values[i][1].index,
|
||||
row_metadata=self._row_metadata[
|
||||
self._keys_and_values[i][1].index],
|
||||
col_metadata=self._col_metadata))
|
||||
for i, part in enumerate(self._grouped_partitions)]
|
||||
else:
|
||||
return [(self._keys_and_values[i][0],
|
||||
DataFrame(row_partitions=part,
|
||||
columns=self._keys_and_values[i][1].index,
|
||||
index=self._index,
|
||||
row_metadata=self._row_metadata,
|
||||
col_metadata=self._col_metadata[
|
||||
self._keys_and_values[i][1].index]))
|
||||
for i, part in enumerate(self._grouped_partitions)]
|
||||
|
||||
@property
|
||||
def ngroups(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return len(self)
|
||||
|
||||
@property
|
||||
def skew(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
def skew(self, **kwargs):
|
||||
return self._apply_agg_function(lambda df: df.skew(**kwargs))
|
||||
|
||||
def ffill(self, limit=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.ffill(limit=limit))
|
||||
|
||||
def sem(self, ddof=1):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.sem(ddof=ddof))
|
||||
|
||||
def mean(self, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.mean(*args, **kwargs))
|
||||
|
||||
@property
|
||||
def any(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.any())
|
||||
|
||||
@property
|
||||
def plot(self):
|
||||
@@ -74,18 +106,17 @@ class DataFrameGroupBy(object):
|
||||
|
||||
@property
|
||||
def groups(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return dict([(k, pd.Index(v)) for k, v in self._keys_and_values])
|
||||
|
||||
def min(self, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.min(**kwargs))
|
||||
|
||||
@property
|
||||
def idxmax(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.idxmax())
|
||||
|
||||
@property
|
||||
def ndim(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._index_grouped.ndim
|
||||
|
||||
def shift(self, periods=1, freq=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
@@ -94,70 +125,82 @@ class DataFrameGroupBy(object):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cumsum(self, axis=0, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.cumsum(axis,
|
||||
*args,
|
||||
**kwargs))
|
||||
|
||||
@property
|
||||
def indices(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return dict(self._keys_and_values)
|
||||
|
||||
@property
|
||||
def pct_change(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.pct_change())
|
||||
|
||||
def filter(self, func, dropna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cummax(self, axis=0, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.cummax(axis=axis,
|
||||
**kwargs))
|
||||
|
||||
def apply(self, func, *args, **kwargs):
|
||||
return self._map_partitions(func)
|
||||
|
||||
def rolling(self, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.apply(func,
|
||||
*args,
|
||||
**kwargs)) \
|
||||
if is_list_like(func) \
|
||||
else self._apply_agg_function(lambda df: df.apply(func,
|
||||
*args,
|
||||
**kwargs))
|
||||
|
||||
@property
|
||||
def dtypes(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.dtypes)
|
||||
|
||||
def first(self, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.first(offset=0,
|
||||
**kwargs))
|
||||
|
||||
def backfill(self, limit=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self.bfill(limit)
|
||||
|
||||
def __getitem__(self, key):
|
||||
# This operation requires a SeriesGroupBy Object
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cummin(self, axis=0, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.cummin(axis=axis,
|
||||
**kwargs))
|
||||
|
||||
def bfill(self, limit=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.bfill(limit=limit))
|
||||
|
||||
@property
|
||||
def idxmin(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.idxmin())
|
||||
|
||||
def prod(self, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.prod(**kwargs))
|
||||
|
||||
def std(self, ddof=1, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.std(ddof=ddof,
|
||||
*args, **kwargs))
|
||||
|
||||
def aggregate(self, arg, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.agg(arg,
|
||||
*args,
|
||||
**kwargs)) \
|
||||
if is_list_like(arg) \
|
||||
else self._apply_agg_function(lambda df: df.agg(arg,
|
||||
*args,
|
||||
**kwargs))
|
||||
|
||||
def last(self, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.last(**kwargs))
|
||||
|
||||
@property
|
||||
def mad(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.mad())
|
||||
|
||||
@property
|
||||
def rank(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.rank())
|
||||
|
||||
@property
|
||||
def corrwith(self):
|
||||
@@ -167,26 +210,28 @@ class DataFrameGroupBy(object):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def max(self, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.max(**kwargs))
|
||||
|
||||
def var(self, ddof=1, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.var(ddof,
|
||||
*args,
|
||||
**kwargs))
|
||||
|
||||
def get_group(self, name, obj=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __len__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return len(self._keys_and_values)
|
||||
|
||||
@property
|
||||
def all(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.all())
|
||||
|
||||
def size(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.size)
|
||||
|
||||
def sum(self, **kwargs):
|
||||
self._map_partitions(lambda df: df.sum())
|
||||
return self._apply_agg_function(lambda df:
|
||||
df.sum(axis=self._axis, **kwargs))
|
||||
|
||||
def __unicode__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
@@ -194,76 +239,136 @@ class DataFrameGroupBy(object):
|
||||
def describe(self, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def boxplot(grouped, subplots=True, column=None, fontsize=None, rot=0,
|
||||
grid=True, ax=None, figsize=None, layout=None, **kwds):
|
||||
def boxplot(self, grouped, subplots=True, column=None, fontsize=None,
|
||||
rot=0, grid=True, ax=None, figsize=None, layout=None, **kwds):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ngroup(self, ascending=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._index_grouped.ngroup(ascending)
|
||||
|
||||
def nunique(self, dropna=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.nunique(dropna))
|
||||
|
||||
def resample(self, rule, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def median(self, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.median(**kwargs))
|
||||
|
||||
def head(self, n=5):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.head(n))
|
||||
|
||||
def cumprod(self, axis=0, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.cumprod(axis,
|
||||
*args,
|
||||
**kwargs))
|
||||
|
||||
def __iter__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._iter.__iter__()
|
||||
|
||||
def agg(self, arg, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
def agg_help(df):
|
||||
if isinstance(df, pd.Series):
|
||||
return pd.DataFrame(df).T
|
||||
else:
|
||||
return df
|
||||
x = [v.agg(arg, axis=self._axis, *args, **kwargs)
|
||||
for k, v in self._iter]
|
||||
|
||||
new_parts = _map_partitions(lambda df: agg_help(df), x)
|
||||
|
||||
from .concat import concat
|
||||
result = concat(new_parts)
|
||||
|
||||
return result
|
||||
|
||||
@property
|
||||
def cov(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_agg_function(lambda df: df.cov())
|
||||
|
||||
def transform(self, func, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
from .concat import concat
|
||||
|
||||
@property
|
||||
def corr(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
new_parts = concat([v.transform(func, *args, **kwargs)
|
||||
for k, v in self._iter])
|
||||
return new_parts
|
||||
|
||||
@property
|
||||
def fillna(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
def corr(self, **kwargs):
|
||||
return self._apply_agg_function(lambda df: df.corr(**kwargs))
|
||||
|
||||
def count(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
def fillna(self, **kwargs):
|
||||
return self._apply_df_function(lambda df: df.fillna(**kwargs))
|
||||
|
||||
def count(self, **kwargs):
|
||||
return self._apply_agg_function(lambda df: df.count(**kwargs))
|
||||
|
||||
def pipe(self, func, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.pipe(func,
|
||||
*args,
|
||||
**kwargs))
|
||||
|
||||
def cumcount(self, ascending=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def tail(self, n=5):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
return self._apply_df_function(lambda df: df.tail(n))
|
||||
|
||||
# expanding and rolling are unique cases and need to likely be handled
|
||||
# separately. They do not appear to be commonly used.
|
||||
def expanding(self, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def rolling(self, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def hist(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def quantile(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
def quantile(self, q=0.5, **kwargs):
|
||||
return self._apply_df_function(lambda df: df.quantile(q, **kwargs)) \
|
||||
if is_list_like(q) \
|
||||
else self._apply_agg_function(lambda df: df.quantile(q, **kwargs))
|
||||
|
||||
@property
|
||||
def diff(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def take(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
def take(self, **kwargs):
|
||||
return self._apply_df_function(lambda df: df.take(**kwargs))
|
||||
|
||||
def _apply_agg_function(self, f):
|
||||
assert callable(f), "\'{0}\' object is not callable".format(type(f))
|
||||
|
||||
result = [pd.DataFrame(f(v)).T for k, v in self._iter]
|
||||
|
||||
new_df = pd.concat(result)
|
||||
if self._axis == 0:
|
||||
new_df.columns = self._columns
|
||||
new_df.index = [k for k, v in self._iter]
|
||||
else:
|
||||
new_df = new_df.T
|
||||
new_df.columns = [k for k, v in self._iter]
|
||||
new_df.index = self._index
|
||||
return new_df
|
||||
|
||||
def _apply_df_function(self, f):
|
||||
assert callable(f), "\'{0}\' object is not callable".format(type(f))
|
||||
|
||||
result = [f(v) for k, v in self._iter]
|
||||
|
||||
from .concat import concat
|
||||
|
||||
new_df = concat(result)
|
||||
return new_df
|
||||
|
||||
|
||||
@ray.remote
|
||||
def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df):
|
||||
|
||||
df = pd.concat(df, axis=axis)
|
||||
|
||||
return [v for k, v in df.groupby(by=by,
|
||||
axis=axis,
|
||||
level=level,
|
||||
as_index=as_index,
|
||||
sort=sort,
|
||||
group_keys=group_keys,
|
||||
squeeze=squeeze)]
|
||||
|
||||
@@ -114,7 +114,8 @@ class _IndexMetadata(_IndexMetadataBase):
|
||||
partitions.
|
||||
"""
|
||||
|
||||
def __init__(self, dfs, index=None, axis=0):
|
||||
def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None,
|
||||
coord_df_oid=None):
|
||||
"""Inits a IndexMetadata from Ray DataFrame partitions
|
||||
|
||||
Args:
|
||||
@@ -126,9 +127,10 @@ class _IndexMetadata(_IndexMetadataBase):
|
||||
A IndexMetadata backed by the specified pd.Index, partitioned off
|
||||
specified partitions
|
||||
"""
|
||||
lengths_oid, coord_df_oid = \
|
||||
_build_index.remote(dfs, index) if axis == 0 else \
|
||||
_build_columns.remote(dfs, index)
|
||||
if dfs is not None:
|
||||
lengths_oid, coord_df_oid = \
|
||||
_build_index.remote(dfs, index) if axis == 0 else \
|
||||
_build_columns.remote(dfs, index)
|
||||
self._coord_df = coord_df_oid
|
||||
self._lengths = lengths_oid
|
||||
|
||||
@@ -269,6 +271,10 @@ class _IndexMetadata(_IndexMetadataBase):
|
||||
self._coord_df.loc[partition_mask & index_within_partition_mask,
|
||||
'index_within_partition'] -= 1
|
||||
|
||||
def copy(self):
|
||||
return _IndexMetadata(coord_df_oid=self._coord_df,
|
||||
lengths_oid=self._lengths)
|
||||
|
||||
|
||||
class _WrappingIndexMetadata(_IndexMetadata):
|
||||
"""IndexMetadata implementation for index across a non-partitioned axis.
|
||||
|
||||
@@ -290,6 +290,42 @@ def test_int_dataframe():
|
||||
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
|
||||
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
|
||||
|
||||
test___array__(ray_df, pandas_df)
|
||||
|
||||
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
|
||||
['sum', 'sum']]
|
||||
for func in apply_agg_functions:
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
if not isinstance(func, list):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
else:
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
|
||||
func = ['sum', lambda df: df.sum()]
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
|
||||
test_transform(ray_df, pandas_df)
|
||||
|
||||
|
||||
def test_float_dataframe():
|
||||
|
||||
@@ -339,7 +375,8 @@ def test_float_dataframe():
|
||||
test_query(ray_df, pandas_df, query_funcs)
|
||||
|
||||
test_mean(ray_df, pandas_df)
|
||||
test_var(ray_df, pandas_df)
|
||||
# TODO Clear floating point error.
|
||||
# test_var(ray_df, pandas_df)
|
||||
test_std(ray_df, pandas_df)
|
||||
test_median(ray_df, pandas_df)
|
||||
test_quantile(ray_df, pandas_df, .25)
|
||||
@@ -414,6 +451,43 @@ def test_float_dataframe():
|
||||
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
|
||||
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
|
||||
|
||||
# TODO Nans are always not equal to each other, fix it
|
||||
# test___array__(ray_df, pandas_df)
|
||||
|
||||
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
|
||||
['sum', 'sum']]
|
||||
for func in apply_agg_functions:
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
if not isinstance(func, list):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
else:
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
|
||||
func = ['sum', lambda df: df.sum()]
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
|
||||
test_transform(ray_df, pandas_df)
|
||||
|
||||
|
||||
def test_mixed_dtype_dataframe():
|
||||
pandas_df = pd.DataFrame({
|
||||
@@ -465,7 +539,8 @@ def test_mixed_dtype_dataframe():
|
||||
test_query(ray_df, pandas_df, query_funcs)
|
||||
|
||||
test_mean(ray_df, pandas_df)
|
||||
test_var(ray_df, pandas_df)
|
||||
# TODO Clear floating point error.
|
||||
# test_var(ray_df, pandas_df)
|
||||
test_std(ray_df, pandas_df)
|
||||
test_median(ray_df, pandas_df)
|
||||
test_quantile(ray_df, pandas_df, .25)
|
||||
@@ -549,6 +624,30 @@ def test_mixed_dtype_dataframe():
|
||||
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
|
||||
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
|
||||
|
||||
test___array__(ray_df, pandas_df)
|
||||
|
||||
apply_agg_functions = ['sum', lambda df: df.sum()]
|
||||
for func in apply_agg_functions:
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
|
||||
func = ['sum', lambda df: df.sum()]
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
|
||||
test_transform(ray_df, pandas_df)
|
||||
|
||||
|
||||
def test_nan_dataframe():
|
||||
pandas_df = pd.DataFrame({
|
||||
@@ -670,6 +769,43 @@ def test_nan_dataframe():
|
||||
test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
|
||||
test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
|
||||
|
||||
# TODO Nans are always not equal to each other, fix it
|
||||
# test___array__(ray_df, pandas_df)
|
||||
|
||||
apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
|
||||
['sum', 'sum']]
|
||||
for func in apply_agg_functions:
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
if not isinstance(func, list):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
else:
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
|
||||
func = ['sum', lambda df: df.sum()]
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_apply(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_aggregate(ray_df, pandas_df, func, 1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
test_agg(ray_df, pandas_df, func, 1)
|
||||
|
||||
test_transform(ray_df, pandas_df)
|
||||
|
||||
|
||||
def test_add():
|
||||
ray_df = create_test_dataframe()
|
||||
@@ -678,18 +814,24 @@ def test_add():
|
||||
ray_df.add(None)
|
||||
|
||||
|
||||
def test_agg():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
ray_df.agg(None)
|
||||
@pytest.fixture
|
||||
def test_agg(ray_df, pandas_df, func, axis):
|
||||
ray_result = ray_df.agg(func, axis)
|
||||
pandas_result = pandas_df.agg(func, axis)
|
||||
if isinstance(ray_result, rdf.DataFrame):
|
||||
assert ray_df_equals_pandas(ray_result, pandas_result)
|
||||
else:
|
||||
assert ray_result.equals(pandas_result)
|
||||
|
||||
|
||||
def test_aggregate():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
ray_df.aggregate(None)
|
||||
@pytest.fixture
|
||||
def test_aggregate(ray_df, pandas_df, func, axis):
|
||||
ray_result = ray_df.aggregate(func, axis)
|
||||
pandas_result = pandas_df.aggregate(func, axis)
|
||||
if isinstance(ray_result, rdf.DataFrame):
|
||||
assert ray_df_equals_pandas(ray_result, pandas_result)
|
||||
else:
|
||||
assert ray_result.equals(pandas_result)
|
||||
|
||||
|
||||
def test_align():
|
||||
@@ -718,11 +860,14 @@ def test_append():
|
||||
ray_df.append(None)
|
||||
|
||||
|
||||
def test_apply():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
ray_df.apply(None)
|
||||
@pytest.fixture
|
||||
def test_apply(ray_df, pandas_df, func, axis):
|
||||
ray_result = ray_df.apply(func, axis)
|
||||
pandas_result = pandas_df.apply(func, axis)
|
||||
if isinstance(ray_result, rdf.DataFrame):
|
||||
assert ray_df_equals_pandas(ray_result, pandas_result)
|
||||
else:
|
||||
assert ray_result.equals(pandas_result)
|
||||
|
||||
|
||||
def test_as_blocks():
|
||||
@@ -2681,11 +2826,12 @@ def test_to_xarray():
|
||||
ray_df.to_xarray()
|
||||
|
||||
|
||||
def test_transform():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
ray_df.transform(None)
|
||||
@pytest.fixture
|
||||
def test_transform(ray_df, pandas_df):
|
||||
ray_df_equals_pandas(ray_df.transform(lambda df: df.isna()),
|
||||
pandas_df.transform(lambda df: df.isna()))
|
||||
ray_df_equals_pandas(ray_df.transform('isna'),
|
||||
pandas_df.transform('isna'))
|
||||
|
||||
|
||||
def test_truediv():
|
||||
@@ -2865,11 +3011,9 @@ def test___round__():
|
||||
ray_df.__round__()
|
||||
|
||||
|
||||
def test___array__():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
ray_df.__array__()
|
||||
@pytest.fixture
|
||||
def test___array__(ray_df, pandas_df):
|
||||
assert np.array_equal(ray_df.__array__(), pandas_df.__array__())
|
||||
|
||||
|
||||
def test___array_wrap__():
|
||||
|
||||
Reference in New Issue
Block a user