mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:00:10 +08:00
Dataframe deprecation (#2353)
This commit is contained in:
committed by
Robert Nishihara
parent
4f1d14e176
commit
4185aaed10
@@ -1,60 +1,2 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pandas
|
||||
# TODO: In the future `set_option` or similar needs to run on every node
|
||||
# in order to keep all pandas instances across nodes consistent
|
||||
from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
|
||||
test, qcut, match, Panel, date_range, Index, MultiIndex,
|
||||
CategoricalIndex, Series, bdate_range, DatetimeIndex,
|
||||
Timedelta, Timestamp, to_timedelta, set_eng_float_format,
|
||||
set_option, NaT, PeriodIndex, Categorical)
|
||||
import threading
|
||||
|
||||
# Parse the installed pandas version once; the major and minor components
# are kept as module-level integers.
pandas_version = pandas.__version__
pandas_major, pandas_minor = (int(part)
                              for part in pandas_version.split(".")[:2])

# Refuse to load on any 0.x pandas release other than 0.22.
if not (pandas_major != 0 or pandas_minor == 22):
    raise Exception("In order to use Pandas on Ray, your pandas version must "
                    "be 0.22. You can run 'pip install pandas==0.22'")
|
||||
|
||||
# Module-wide default partition count; read through get_npartitions() and
# replaced through set_npartition_default().
DEFAULT_NPARTITIONS = 8


def set_npartition_default(n):
    """Replace the module-wide default partition count.

    Args:
        n: The new value stored in DEFAULT_NPARTITIONS (not validated).
    """
    global DEFAULT_NPARTITIONS
    DEFAULT_NPARTITIONS = n


def get_npartitions():
    """Return the current value of DEFAULT_NPARTITIONS."""
    return DEFAULT_NPARTITIONS
|
||||
|
||||
|
||||
# These modules are imported after the two functions above
# because they depend on npartitions.
|
||||
from .dataframe import DataFrame # noqa: 402
|
||||
from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402
|
||||
read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402
|
||||
read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402
|
||||
read_sql) # noqa: 402
|
||||
from .concat import concat # noqa: 402
|
||||
from .datetimes import to_datetime # noqa: 402
|
||||
from .reshape import get_dummies # noqa: 402
|
||||
|
||||
__all__ = [
|
||||
"DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval",
|
||||
"unique", "value_counts", "cut", "to_numeric", "factorize", "test", "qcut",
|
||||
"match", "to_datetime", "get_dummies", "Panel", "date_range", "Index",
|
||||
"MultiIndex", "Series", "bdate_range", "DatetimeIndex", "to_timedelta",
|
||||
"set_eng_float_format", "set_option", "CategoricalIndex", "Timedelta",
|
||||
"Timestamp", "NaT", "PeriodIndex", "Categorical"
|
||||
]
|
||||
|
||||
# Start Ray as a side effect of importing this module, but only when the
# import happens on the thread named "MainThread".
try:
    if threading.current_thread().name == "MainThread":
        import ray

        ray.init()
except AssertionError:
    # NOTE(review): presumably raised when Ray is already initialized --
    # confirm. The failure is deliberately ignored.
    pass
|
||||
raise DeprecationWarning("Pandas on Ray has moved to Modin: "
|
||||
"github.com/modin-project/modin")
|
||||
|
||||
@@ -1,133 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pandas
|
||||
import numpy as np
|
||||
from .dataframe import DataFrame
|
||||
from .utils import _reindex_helper
|
||||
|
||||
|
||||
def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
           keys=None, levels=None, names=None, verify_integrity=False,
           copy=True):
    """Concatenate pandas and Ray DataFrame objects into a Ray DataFrame.

    Args:
        objs: Iterable of pandas.Series, pandas.DataFrame and/or Ray
            DataFrame objects. ``None`` entries are dropped.
        axis: Axis to concatenate along; normalized with pandas'
            ``_get_axis_number``.
        join ('inner' or 'outer'): How to combine the other axis.
        join_axes, ignore_index, levels, names, verify_integrity, copy:
            Forwarded to ``pandas.concat``.
        keys: If given, used to select the items of ``objs`` (in that
            order) and also forwarded to ``pandas.concat``.

    Returns:
        A Ray DataFrame holding the concatenated data.

    Raises:
        NotImplementedError: If ``objs`` is a dict and no ``keys`` are given.
        ValueError: If there is nothing to concatenate, if an object has an
            unsupported type, or if ``join`` is not 'inner' or 'outer'.
    """
    # This check has to run before `objs` is rebuilt as a list below;
    # afterwards `objs` can never be a dict and the check would be dead code.
    if keys is None and isinstance(objs, dict):
        raise NotImplementedError(
            "Obj as dicts not implemented. To contribute to "
            "Pandas on Ray, please visit github.com/ray-project/ray.")

    if keys is not None:
        objs = [objs[k] for k in keys]
    else:
        objs = list(objs)

    if len(objs) == 0:
        raise ValueError("No objects to concatenate")

    objs = [obj for obj in objs if obj is not None]

    if len(objs) == 0:
        raise ValueError("All objects passed were None")

    valid_types = (pandas.Series, pandas.DataFrame, DataFrame)
    # `None` is a safe sentinel: all None entries were filtered out above.
    invalid = next((obj for obj in objs
                    if not isinstance(obj, valid_types)), None)
    if invalid is not None:
        # The template must be formatted explicitly; passing the type as a
        # second exception argument leaves the "{0}" placeholder unfilled.
        raise ValueError("cannot concatenate object of type \"{0}\"; only "
                         "pandas.Series, pandas.DataFrame, "
                         "and ray.dataframe.DataFrame objs are "
                         "valid".format(type(invalid)))

    if all(isinstance(obj, pandas.Series) for obj in objs):
        # Only Series: let pandas do all of the work locally.
        return DataFrame(pandas.concat(objs, axis=axis, join=join,
                                       join_axes=join_axes,
                                       ignore_index=ignore_index, keys=keys,
                                       levels=levels, names=names,
                                       verify_integrity=verify_integrity,
                                       copy=copy))

    axis = pandas.DataFrame()._get_axis_number(axis)

    if join not in ['inner', 'outer']:
        raise ValueError("Only can inner (intersect) or outer (union) join the"
                         " other axis")

    def series_to_df(series, columns):
        # Promote a Series to a local pandas frame; it is turned into a Ray
        # DataFrame together with the other pandas frames further down.
        df = pandas.DataFrame(series)
        df.columns = columns
        return df

    # Pandas puts all of the Series in a single column named 0. This is
    # true regardless of the existence of another column named 0 in the
    # concat.
    if axis == 0:
        objs = [series_to_df(obj, [0])
                if isinstance(obj, pandas.Series) else obj for obj in objs]
    else:
        # Pandas starts the count at 0 so this will increment the names as
        # long as there's a new nameless Series being added.
        unnamed_labels = iter(range(len(objs)))
        # The label is wrapped in a list because pandas rejects a scalar
        # when assigning `df.columns`.
        objs = [series_to_df(obj, [obj.name if obj.name is not None
                                   else next(unnamed_labels)])
                if isinstance(obj, pandas.Series) else obj for obj in objs]

    # Collected only after every Series has been promoted to a frame: a
    # Series has no `columns` attribute. These are reused below.
    all_index, all_columns = list(zip(*[(obj.index, obj.columns)
                                        for obj in objs]))

    # Using concat on the columns and index is fast because they're empty,
    # and it forces the error checking. It also puts the columns in the
    # correct order for us.
    final_index = \
        pandas.concat([pandas.DataFrame(index=idx) for idx in all_index],
                      axis=axis, join=join, join_axes=join_axes,
                      ignore_index=ignore_index, keys=keys, levels=levels,
                      names=names, verify_integrity=verify_integrity,
                      copy=False).index
    final_columns = \
        pandas.concat([pandas.DataFrame(columns=col)
                       for col in all_columns],
                      axis=axis, join=join, join_axes=join_axes,
                      ignore_index=ignore_index, keys=keys, levels=levels,
                      names=names, verify_integrity=verify_integrity,
                      copy=False).columns

    # Put all of the DataFrames into Ray format
    # TODO just partition the DataFrames instead of building a new Ray DF.
    objs = [DataFrame(obj) if isinstance(obj, (pandas.DataFrame,
                                               pandas.Series)) else obj
            for obj in objs]

    # Here we reuse all_columns/index so we don't have to materialize objects
    # from remote memory built in the previous line. In the future, we won't be
    # building new DataFrames, rather just partitioning the DataFrames.
    if axis == 0:
        n_parts = len(objs[0]._block_partitions)
        new_blocks = np.array([_reindex_helper._submit(
            args=tuple([all_columns[i], final_columns, axis, n_parts]
                       + part.tolist()),
            num_return_vals=n_parts)
            for i, obj in enumerate(objs)
            for part in obj._block_partitions])
    else:
        # Transposing the columns is necessary because the remote task treats
        # everything like rows and returns in row-major format. Luckily, this
        # operation is cheap in numpy.
        n_parts = len(objs[0]._block_partitions.T)
        new_blocks = np.array([_reindex_helper._submit(
            args=tuple([all_index[i], final_index, axis, n_parts]
                       + part.tolist()),
            num_return_vals=n_parts)
            for i, obj in enumerate(objs)
            for part in obj._block_partitions.T]).T

    return DataFrame(block_partitions=new_blocks,
                     columns=final_columns,
                     index=final_index)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,64 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pandas
|
||||
import ray
|
||||
|
||||
from .dataframe import DataFrame
|
||||
from .utils import _map_partitions
|
||||
|
||||
|
||||
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None,
                box=True, format=None, exact=True, unit=None,
                infer_datetime_format=False, origin='unix'):
    """Convert the arg to datetime format.

    Anything that is not a Ray DataFrame is handed straight to
    ``pandas.to_datetime``. A Ray DataFrame is converted partition by
    partition and the pieces are combined locally.

    Args:
        arg: The object to convert.
        errors ('raise' or 'ignore'): If 'ignore', errors are silenced.
        dayfirst (bool): Date format is passed in as day first.
        yearfirst (bool): Date format is passed in as year first.
        utc (bool): Returns a UTC DatetimeIndex if True.
        box (bool): If True, returns a DatetimeIndex.
        format (string): strftime to parse time, eg "%d/%m/%Y".
        exact (bool): If True, require an exact format match.
        unit (string): Unit of the arg.
        infer_datetime_format (bool): Whether or not to infer the format.
        origin (string): Define the reference date.

    Returns:
        Type depends on input:

        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp
    """
    # Every option is forwarded unchanged to pandas, so bundle them once.
    options = dict(errors=errors, dayfirst=dayfirst, yearfirst=yearfirst,
                   utc=utc, box=box, format=format, exact=exact, unit=unit,
                   infer_datetime_format=infer_datetime_format,
                   origin=origin)

    if not isinstance(arg, DataFrame):
        return pandas.to_datetime(arg, **options)

    if errors == 'raise':
        # Run pandas on an empty frame that has the same columns so that
        # pandas' own checks happen locally before any remote work starts.
        pandas.to_datetime(pandas.DataFrame(columns=arg.columns), **options)

    def datetime_helper(df, cols):
        # Each partition gets the full column labels before conversion.
        df.columns = cols
        return pandas.to_datetime(df, **options)

    datetime_series = _map_partitions(datetime_helper, arg._row_partitions,
                                      arg.columns)
    result = pandas.concat(ray.get(datetime_series), copy=False)
    result.index = arg.index

    return result
|
||||
@@ -1,581 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pandas
|
||||
import numpy as np
|
||||
import pandas.core.groupby
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
import pandas.core.common as com
|
||||
|
||||
import ray
|
||||
|
||||
from .utils import _inherit_docstrings, _reindex_helper
|
||||
from .concat import concat
|
||||
from .index_metadata import _IndexMetadata
|
||||
|
||||
|
||||
@_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy,
                     excluded=[pandas.core.groupby.DataFrameGroupBy,
                               pandas.core.groupby.DataFrameGroupBy.__init__])
class DataFrameGroupBy(object):
    """Group-by object over a Ray DataFrame.

    Group membership is computed locally by grouping the index (axis 0) or
    the columns (any other axis) with pandas. When there is more than one
    group, the block partitions are split per group by the remote
    ``groupby`` task.

    Public methods carry `#` comments rather than docstrings because the
    class decorator supplies docstrings from pandas.
    """

    def __init__(self, df, by, axis, level, as_index, sort, group_keys,
                 squeeze, **kwargs):

        self._columns = df.columns
        self._index = df.index
        self._axis = axis

        self._row_metadata = df._row_metadata
        self._col_metadata = df._col_metadata

        if axis == 0:
            partitions = list(df._block_partitions.T)
            self._index_grouped = \
                pandas.Series(self._index, index=self._index) \
                .groupby(by=by, sort=sort)
        else:
            partitions = list(df._block_partitions)
            self._index_grouped = \
                pandas.Series(self._columns, index=self._columns) \
                .groupby(by=by, sort=sort)

        # (group key, Series of the labels belonging to that group) pairs.
        self._keys_and_values = [(k, v)
                                 for k, v in self._index_grouped]

        if len(self) > 1:
            # One remote task per partition; each returns one piece per
            # group. zip(*) regroups the pieces so that every entry holds
            # all pieces of a single group.
            self._grouped_partitions = \
                list(zip(*(groupby._submit(args=(by,
                                                 axis,
                                                 level,
                                                 as_index,
                                                 sort,
                                                 group_keys,
                                                 squeeze)
                                           + tuple(part.tolist()),
                                           num_return_vals=len(self))
                           for part in partitions)))
        else:
            if axis == 0:
                self._grouped_partitions = [df._col_partitions]
            else:
                self._grouped_partitions = [df._row_partitions]

    def __getattr__(self, key):
        """After regular attribute access, looks up the name in the columns

        Args:
            key (str): Attribute name.

        Returns:
            The value of the attribute.
        """
        try:
            return object.__getattribute__(self, key)
        except AttributeError:
            # Read `_columns` from the instance dict: going through normal
            # attribute access would re-enter __getattr__ and recurse
            # forever when `_columns` has not been set yet.
            columns = self.__dict__.get("_columns")
            if columns is not None and key in columns:
                raise NotImplementedError(
                    "SeriesGroupBy is not implemented. "
                    "To contribute to Pandas on Ray, please visit "
                    "github.com/ray-project/ray.")
            raise

    @property
    def _iter(self):
        # Builds a fresh list of (group key, Ray DataFrame) pairs on every
        # access.
        from .dataframe import DataFrame

        if self._axis == 0:
            return [(self._keys_and_values[i][0],
                     DataFrame(col_partitions=part,
                               columns=self._columns,
                               index=self._keys_and_values[i][1].index,
                               col_metadata=self._col_metadata))
                    for i, part in enumerate(self._grouped_partitions)]
        else:
            return [(self._keys_and_values[i][0],
                     DataFrame(row_partitions=part,
                               columns=self._keys_and_values[i][1].index,
                               index=self._index,
                               row_metadata=self._row_metadata))
                    for i, part in enumerate(self._grouped_partitions)]

    @property
    def ngroups(self):
        return len(self)

    def skew(self, **kwargs):
        return self._apply_agg_function(lambda df: df.skew(axis=self._axis,
                                                           **kwargs))

    def ffill(self, limit=None):
        return self._apply_df_function(lambda df: df.ffill(axis=self._axis,
                                                           limit=limit))

    def sem(self, ddof=1):
        return self._apply_agg_function(lambda df: df.sem(axis=self._axis,
                                                          ddof=ddof))

    def mean(self, *args, **kwargs):
        return self._apply_agg_function(lambda df: df.mean(axis=self._axis,
                                                           *args,
                                                           **kwargs))

    def any(self):
        return self._apply_agg_function(lambda df: df.any(axis=self._axis))

    @property
    def plot(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def ohlc(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def __bytes__(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    @property
    def tshift(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    @property
    def groups(self):
        return {k: pandas.Index(v) for k, v in self._keys_and_values}

    def min(self, **kwargs):
        return self._apply_agg_function(lambda df: df.min(axis=self._axis,
                                                          **kwargs))

    def idxmax(self):
        return self._apply_idx_function("idxmax")

    @property
    def ndim(self):
        return 2  # ndim is always 2 for DataFrames

    def shift(self, periods=1, freq=None, axis=0):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def nth(self, n, dropna=None):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def cumsum(self, axis=0, *args, **kwargs):
        return self._apply_df_function(lambda df: df.cumsum(axis,
                                                            *args,
                                                            **kwargs))

    @property
    def indices(self):
        return dict(self._keys_and_values)

    def pct_change(self):
        return self._apply_agg_function(
            lambda df: df.pct_change(axis=self._axis))

    def filter(self, func, dropna=True, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def cummax(self, axis=0, **kwargs):
        return self._apply_df_function(lambda df: df.cummax(axis,
                                                            **kwargs))

    def apply(self, func, *args, **kwargs):
        # NOTE(review): *args and **kwargs are accepted but are not passed
        # on to `func`; confirm whether that is intended.
        result = [func(v) for k, v in self._iter]

        if isinstance(result[0], pandas.Series):
            # Applied an aggregation function
            return self._label_agg_result(concat(result, axis=1))

        new_df = concat(result, axis=self._axis)
        return self._restore_original_labels(new_df)

    @property
    def dtypes(self):
        if self._axis == 1:
            raise ValueError("Cannot call dtypes on groupby with axis=1")
        return self._apply_agg_function(lambda df: df.dtypes)

    def first(self, **kwargs):
        # NOTE(review): `first` is treated as an aggregation while `last`
        # goes through _apply_df_function; confirm which one is intended.
        return self._apply_agg_function(lambda df: df.first(offset=0,
                                                            **kwargs))

    def backfill(self, limit=None):
        return self.bfill(limit)

    def __getitem__(self, key):
        # This operation requires a SeriesGroupBy Object
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def cummin(self, axis=0, **kwargs):
        return self._apply_df_function(lambda df: df.cummin(axis=axis,
                                                            **kwargs))

    def bfill(self, limit=None):
        return self._apply_df_function(lambda df: df.bfill(axis=self._axis,
                                                           limit=limit))

    def idxmin(self):
        return self._apply_idx_function("idxmin")

    def prod(self, **kwargs):
        return self._apply_agg_function(lambda df: df.prod(axis=self._axis,
                                                           **kwargs))

    def std(self, ddof=1, *args, **kwargs):
        return self._apply_agg_function(lambda df: df.std(axis=self._axis,
                                                          ddof=ddof,
                                                          *args,
                                                          **kwargs))

    def aggregate(self, arg, *args, **kwargs):
        if self._axis != 0:
            # This is not implemented in pandas,
            # so we throw a different message
            raise NotImplementedError("axis other than 0 is not supported")

        if is_list_like(arg):
            raise NotImplementedError(
                "This requires Multi-level index to be implemented. "
                "To contribute to Pandas on Ray, please visit "
                "github.com/ray-project/ray.")
        return self._apply_agg_function(lambda df: df.agg(arg,
                                                          axis=self._axis,
                                                          *args,
                                                          **kwargs))

    def last(self, **kwargs):
        return self._apply_df_function(lambda df: df.last(offset=0,
                                                          **kwargs))

    def mad(self):
        return self._apply_agg_function(lambda df: df.mad())

    def rank(self):
        return self._apply_df_function(lambda df: df.rank(axis=self._axis))

    @property
    def corrwith(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def pad(self, limit=None):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def max(self, **kwargs):
        return self._apply_agg_function(lambda df: df.max(axis=self._axis,
                                                          **kwargs))

    def var(self, ddof=1, *args, **kwargs):
        return self._apply_agg_function(lambda df: df.var(ddof=ddof,
                                                          axis=self._axis,
                                                          *args,
                                                          **kwargs))

    def get_group(self, name, obj=None):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def __len__(self):
        return len(self._keys_and_values)

    def all(self):
        return self._apply_agg_function(lambda df: df.all())

    def size(self):
        return self._apply_agg_function(lambda df: df.size)

    def sum(self, **kwargs):
        return self._apply_agg_function(lambda df:
                                        df.sum(axis=self._axis, **kwargs))

    def __unicode__(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def describe(self, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def boxplot(self, grouped, subplots=True, column=None, fontsize=None,
                rot=0, grid=True, ax=None, figsize=None, layout=None, **kwds):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def ngroup(self, ascending=True):
        return self._index_grouped.ngroup(ascending)

    def nunique(self, dropna=True):
        return self._apply_agg_function(lambda df: df.nunique(dropna=dropna,
                                                              axis=self._axis))

    def resample(self, rule, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def median(self, **kwargs):
        return self._apply_agg_function(lambda df: df.median(axis=self._axis,
                                                             **kwargs))

    def head(self, n=5):
        return self._head_or_tail("head", n)

    def cumprod(self, axis=0, *args, **kwargs):
        return self._apply_df_function(lambda df: df.cumprod(axis,
                                                             *args,
                                                             **kwargs))

    def __iter__(self):
        return self._iter.__iter__()

    def agg(self, arg, *args, **kwargs):
        return self.aggregate(arg, *args, **kwargs)

    def cov(self):
        return self._apply_agg_function(lambda df: df.cov())

    def transform(self, func, *args, **kwargs):
        return self._apply_df_function(lambda df: df.transform(func,
                                                               *args,
                                                               **kwargs))

    def corr(self, **kwargs):
        return self._apply_agg_function(lambda df: df.corr(**kwargs))

    def fillna(self, **kwargs):
        return self._apply_df_function(lambda df: df.fillna(axis=self._axis,
                                                            **kwargs))

    def count(self, **kwargs):
        return self._apply_agg_function(lambda df: df.count(self._axis,
                                                            **kwargs))

    def pipe(self, func, *args, **kwargs):
        return com._pipe(self, func, *args, **kwargs)

    def cumcount(self, ascending=True):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def tail(self, n=5):
        return self._head_or_tail("tail", n)

    # expanding and rolling are unique cases and need to likely be handled
    # separately. They do not appear to be commonly used.
    def expanding(self, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def rolling(self, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def hist(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def quantile(self, q=0.5, **kwargs):
        if is_list_like(q):
            raise NotImplementedError(
                "This requires Multi-level index to be implemented. "
                "To contribute to Pandas on Ray, please visit "
                "github.com/ray-project/ray.")

        return self._apply_agg_function(lambda df: df.quantile(q=q,
                                                               axis=self._axis,
                                                               **kwargs))

    def diff(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def take(self, **kwargs):
        return self._apply_df_function(lambda df: df.take(**kwargs))

    def _label_agg_result(self, new_df):
        """Label a per-group result that was concatenated along axis 1.

        For axis 0 the frame is transposed so that there is one row per
        group; otherwise there is one column per group.
        """
        # The keys are read from the locally held pairs; going through
        # `self._iter` would rebuild every group DataFrame just for this.
        group_keys = [k for k, _ in self._keys_and_values]

        if self._axis == 0:
            new_df = new_df.T
            new_df.columns = self._columns
            new_df.index = group_keys
        else:
            new_df.columns = group_keys
            new_df.index = self._index
        return new_df

    def _reindex_rows(self, new_df, target_index):
        """Reindex the rows of new_df to target_index, in place."""
        n_row_parts = len(new_df._block_partitions)
        new_df._block_partitions = np.array([_reindex_helper._submit(
            args=tuple([new_df.index, target_index, 1, n_row_parts]
                       + block.tolist()),
            num_return_vals=n_row_parts)
            for block in new_df._block_partitions.T]).T
        new_df.index = target_index
        new_df._row_metadata = \
            _IndexMetadata(new_df._block_partitions[:, 0],
                           index=new_df.index, axis=0)
        return new_df

    def _reindex_columns(self, new_df, target_columns):
        """Reindex the columns of new_df to target_columns, in place."""
        n_col_parts = new_df._block_partitions.shape[1]
        new_df._block_partitions = np.array([_reindex_helper._submit(
            args=tuple([new_df.columns, target_columns, 0, n_col_parts]
                       + block.tolist()),
            num_return_vals=n_col_parts)
            for block in new_df._block_partitions])
        new_df.columns = target_columns
        new_df._col_metadata = \
            _IndexMetadata(new_df._block_partitions[0, :],
                           index=new_df.columns, axis=1)
        return new_df

    def _restore_original_labels(self, new_df):
        """Put new_df back into the label order of the grouped frame."""
        if self._axis == 0:
            return self._reindex_rows(new_df, self._index)
        return self._reindex_columns(new_df, self._columns)

    def _apply_idx_function(self, method_name):
        """Shared implementation of idxmax/idxmin.

        Args:
            method_name (str): "idxmax" or "idxmin"; called on each group.
        """
        def idx_helper(df, index):
            result = getattr(df, method_name)(axis=self._axis)
            # Translate each value through the group's own label Series.
            result = result.apply(lambda v: index[v])
            return result

        results = [idx_helper(g[1], i[1])
                   for g, i in zip(self._iter, self._index_grouped)]

        return self._label_agg_result(concat(results, axis=1))

    def _head_or_tail(self, method_name, n):
        """Shared implementation of head/tail.

        Args:
            method_name (str): "head" or "tail".
            n (int): Number of entries to keep per group.
        """
        result = [getattr(v, method_name)(n) for k, v in self._iter]
        new_df = concat(result, axis=self._axis)

        if self._axis == 0:
            # Use the Series' own head/tail rather than slicing: `v[-n:]`
            # keeps everything for n == 0, which disagrees with tail(0).
            kept = [getattr(v, method_name)(n)
                    for _, v in self._keys_and_values]
            flattened_index = {i for j in kept for i in j}
            # Preserve the order in which labels appear in the original
            # index.
            sorted_index = [i for i in self._index if i in flattened_index]
            self._reindex_rows(new_df, sorted_index)

        return new_df

    def _apply_agg_function(self, f):
        """Apply f to every group and return one row/column per group."""
        assert callable(f), "\'{0}\' object is not callable".format(type(f))

        result = [f(v) for k, v in self._iter]
        return self._label_agg_result(concat(result, axis=1))

    def _apply_df_function(self, f, concat_axis=None):
        """Apply f to every group and restore the original label order.

        Args:
            f (callable): Called with each group's DataFrame.
            concat_axis: Axis used to combine the per-group results;
                defaults to the group-by axis.
        """
        assert callable(f), "\'{0}\' object is not callable".format(type(f))

        result = [f(v) for k, v in self._iter]
        concat_axis = self._axis if concat_axis is None else concat_axis

        new_df = concat(result, axis=concat_axis)

        return self._restore_original_labels(new_df)
|
||||
|
||||
|
||||
@ray.remote
def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df):
    """Group the concatenation of the given frames with pandas.

    Args:
        by, axis, level, as_index, sort, group_keys, squeeze: Passed
            through to the pandas ``groupby`` call.
        *df: The frames to combine along ``axis`` before grouping.

    Returns:
        A list with the data of each group, in the order pandas yields
        the groups; the group keys are dropped.
    """
    combined = pandas.concat(df, axis=axis)
    grouped = combined.groupby(by=by,
                               axis=axis,
                               level=level,
                               as_index=as_index,
                               sort=sort,
                               group_keys=group_keys,
                               squeeze=squeeze)

    return [group for _, group in grouped]
|
||||
@@ -1,415 +0,0 @@
|
||||
import pandas
|
||||
import numpy as np
|
||||
import ray
|
||||
|
||||
from .utils import (
|
||||
_build_row_lengths,
|
||||
_build_col_widths,
|
||||
_build_coord_df)
|
||||
|
||||
from pandas.core.indexing import convert_to_index_sliceable
|
||||
|
||||
|
||||
class _IndexMetadata(object):
    """Wrapper for Pandas indexes in Ray DataFrames. Handles all of the
    metadata specific to the axis of partition (setting indexes,
    calculating the index within partition of a value, etc.). This
    implementation assumes the underlying index lies across multiple
    partitions.

    IMPORTANT NOTE: Currently all operations, as implemented, are inplace.

    WARNING: Currently, the `_lengths` item is the source of truth for an
    _IndexMetadata object, since it is easy to manage, and that the coord_df
    item may be deprecated in the future. As such, it is _very_ important that
    any functions that mutate the coord_df splits in anyway first modify the
    lengths. Otherwise bad things might happen!
    """

    def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None,
                 coord_df_oid=None):
        """Inits a IndexMetadata from Ray DataFrame partitions

        Args:
            dfs ([ObjectID]): ObjectIDs of dataframe partitions
            index (pandas.Index): Index of the Ray DataFrame.
            axis: Axis of partition (0=row partitions, 1=column partitions)
            lengths_oid: Partition lengths, or an ObjectID resolving to
                them. Must be passed together with `coord_df_oid`.
            coord_df_oid: Coordinate dataframe, or an ObjectID resolving to
                it. Must be passed together with `lengths_oid`.

        Returns:
            A IndexMetadata backed by the specified pandas.Index, partitioned
            off specified partitions
        """
        assert (lengths_oid is None) == (coord_df_oid is None), \
            "Must pass both or neither of lengths_oid and coord_df_oid"

        if dfs is not None and lengths_oid is None:
            if axis == 0:
                lengths_oid = _build_row_lengths.remote(dfs)
            else:
                lengths_oid = _build_col_widths.remote(dfs)
            coord_df_oid = _build_coord_df.remote(lengths_oid, index)

        # The order matters: the `_coord_df` setter may overwrite the index
        # cache, so the explicit `index` is stored last.
        self._lengths = lengths_oid
        self._coord_df = coord_df_oid
        self._index_cache = index
        self._cached_index = False

    def _get__lengths(self):
        """Return the partition lengths, fetching them from Ray if needed."""
        cached = self._lengths_cache
        is_oid_list = (isinstance(cached, list) and
                       # Guard the `[0]` lookup against an empty list.
                       len(cached) > 0 and
                       isinstance(cached[0], ray.ObjectID))
        if isinstance(cached, ray.ObjectID) or is_oid_list:
            self._lengths_cache = ray.get(cached)
        return self._lengths_cache

    def _set__lengths(self, lengths):
        """Store the partition lengths (values or ObjectID(s))."""
        self._lengths_cache = lengths

    _lengths = property(_get__lengths, _set__lengths)

    def _get__coord_df(self):
        """Get the coordinate dataframe wrapped by this _IndexMetadata.

        Since we may have had an index set before our coord_df was
        materialized, we'll have to apply it to the newly materialized df
        """
        if isinstance(self._coord_df_cache, ray.ObjectID):
            self._coord_df_cache = ray.get(self._coord_df_cache)
        if self._cached_index:
            self._coord_df_cache.index = self._index_cache
            self._cached_index = False
        return self._coord_df_cache

    def _set__coord_df(self, coord_df):
        """Set the coordinate dataframe wrapped by this _IndexMetadata.

        Sometimes we set the _IndexMetadata's coord_df outside of the
        constructor, generally using fxns like drop(). This produces a modified
        index, so we need to reflect the change on the index cache.

        If the set _IndexMetadata is an OID instead (due to a copy or whatever
        reason), we fall back relying on `_index_cache`.
        """
        if not isinstance(coord_df, ray.ObjectID):
            self._index_cache = coord_df.index
        self._coord_df_cache = coord_df

    _coord_df = property(_get__coord_df, _set__coord_df)

    def _get_index(self):
        """Get the index wrapped by this _IndexMetadata.

        While the coord_df is still an ObjectID the cached index is
        returned; once the coord_df is materialized its own index is used.
        """
        if isinstance(self._coord_df_cache, ray.ObjectID):
            return self._index_cache
        else:
            return self._coord_df_cache.index

    def _set_index(self, new_index):
        """Set the index wrapped by this _IndexMetadata.

        It is important to always set `_index_cache` even if the coord_df is
        materialized due to the possibility that it is set to an OID later on.
        This design is more straightforward than caching indexes on setting the
        coord_df to an OID due to the possibility of an OID-to-OID change.
        """
        # Round-trip through a DataFrame so pandas coerces the input into an
        # Index object.
        new_index = pandas.DataFrame(index=new_index).index
        assert len(new_index) == len(self)

        self._index_cache = new_index
        if isinstance(self._coord_df_cache, ray.ObjectID):
            # Applied lazily when the coord_df is materialized.
            self._cached_index = True
        else:
            self._coord_df_cache.index = new_index

    index = property(_get_index, _set_index)

    def _get_index_cache(self):
        """Get the cached Index object, which may sometimes be an OID.

        This will ray.get the Index object out of the Ray store lazily, such
        that it is not grabbed until it is needed in the driver. This layer of
        abstraction is important for allowing this object to be instantiated
        with a remote Index object.

        Returns:
            The Index object in _index_cache.
        """
        if self._index_cache_validator is None:
            self._index_cache_validator = pandas.RangeIndex(len(self))
        elif isinstance(self._index_cache_validator,
                        ray.ObjectID):
            self._index_cache_validator = ray.get(self._index_cache_validator)

        return self._index_cache_validator

    def _set_index_cache(self, new_index):
        """Sets the new index cache.

        Args:
            new_index: The Index to set the _index_cache to.
        """
        self._index_cache_validator = new_index

    # _index_cache_validator is an extra layer of abstraction to allow the
    # cache to accept ObjectIDs and ray.get them when needed.
    _index_cache = property(_get_index_cache, _set_index_cache)

    def coords_of(self, key):
        """Returns the coordinates (partition, index_within_partition) of the
        provided key in the index. Can be called on its own or implicitly
        through __getitem__

        Args:
            key:
                item to get coordinates of. Can also be a tuple of item
                and {"partition", "index_within_partition"} if caller only
                needs one of the coordinates

        Returns:
            Pandas object with the keys specified. If key is a single object
            it will be a pandas.Series with items `partition` and
            `index_within_partition`, and if key is a slice or if the key is
            duplicate it will be a pandas.DataFrame with said items as columns.
        """
        return self._coord_df.loc[key]

    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
                group_keys=True, squeeze=False, **kwargs):
        """Group the coordinate dataframe and return the re-assembled groups.

        All arguments are forwarded to the coord_df's pandas `groupby`; each
        group is then passed through an identity slice (`x[:]`) via `apply`,
        and whatever pandas returns from that `apply` is returned.
        """
        assignments_df = self._coord_df.groupby(by=by, axis=axis, level=level,
                                                as_index=as_index, sort=sort,
                                                group_keys=group_keys,
                                                squeeze=squeeze, **kwargs)\
            .apply(lambda x: x[:])
        return assignments_df

    def partition_series(self, partition):
        """Return the `index_within_partition` values of one partition."""
        return self[self._coord_df['partition'] == partition,
                    'index_within_partition']

    def __len__(self):
        """Total number of entries, i.e. the sum of the partition lengths."""
        return int(sum(self._lengths))

    def reset_partition_coords(self, partitions=None):
        """Renumber `index_within_partition` as 0..n-1 for each partition.

        Args:
            partitions: Iterable of partition ids to renumber.
        """
        partitions = np.array(partitions)

        for partition in partitions:
            partition_mask = (self._coord_df['partition'] == partition)
            # Since we are replacing columns with RangeIndex inside the
            # partition, we have to make sure that our reference to it is
            # updated as well.
            try:
                self._coord_df.loc[partition_mask,
                                   'index_within_partition'] = np.arange(
                    sum(partition_mask)).astype(int)
            except ValueError:
                # Copy the arrow sealed dataframe so we can mutate it.
                # We only do this the first time we try to mutate the sealed.
                self._coord_df = self._coord_df.copy()
                self._coord_df.loc[partition_mask,
                                   'index_within_partition'] = np.arange(
                    sum(partition_mask)).astype(int)

    def insert(self, key, loc=None, partition=None,
               index_within_partition=None):
        """Inserts a key at a certain location in the index, or a certain coord
        in a partition. Called with either `loc` or `partition` and
        `index_within_partition`. If called with both, `loc` will be used.

        Args:
            key: item to insert into index
            loc: location to insert into index
            partition: partition to insert into
            index_within_partition: index within partition to insert into

        Returns:
            Currently (stop-gap) the tuple
            `(partition, index_within_partition)` of the insert location;
            the coordinate dataframe itself is not modified yet.
        """
        # Perform insert on a specific partition
        # Determine which partition to place it in, and where in that partition
        if loc is not None:
            cum_lens = np.cumsum(self._lengths)
            if len(cum_lens) > 1:
                partition = np.digitize(loc, cum_lens[:-1], right=True)
            else:
                partition = 0
            if partition >= len(cum_lens):
                if loc > cum_lens[-1]:
                    raise IndexError("index {0} is out of bounds".format(loc))
                else:
                    index_within_partition = self._lengths[-1]
            else:
                # `.item()` replaces `np.asscalar`, which was deprecated and
                # later removed from NumPy.
                first_in_partition = \
                    np.concatenate(([0], cum_lens))[partition].item()
                index_within_partition = loc - first_in_partition

        # TODO: Stop-gap solution until we begin passing IndexMetadatas
        return partition, index_within_partition

        # NOTE: everything below is unreachable until the stop-gap return
        # above is removed.

        # Generate new index
        new_index = self.index.insert(loc, key)

        # Shift indices in partition where we inserted column
        idx_locs = (self._coord_df.partition == partition) & \
                   (self._coord_df.index_within_partition ==
                    index_within_partition)
        # TODO: Determine why self._coord_df{,_cache} are read-only
        _coord_df_copy = self._coord_df.copy()
        _coord_df_copy.loc[idx_locs, 'index_within_partition'] += 1

        # TODO: Determine if there's a better way to do a row-index insert in
        # pandas, because this is very annoying/unsure of efficiency
        # Create new coord entry to insert
        coord_to_insert = pandas.DataFrame(
            {'partition': partition,
             'index_within_partition': index_within_partition},
            index=[key])

        # Insert into cached RangeIndex, and order by new column index
        self._coord_df = _coord_df_copy.append(coord_to_insert).loc[new_index]

        # Return inserted coordinate for callee
        return coord_to_insert

    def get_global_indices(self, partition, index_within_partition_list):
        """Translate within-partition positions into global positions.

        Args:
            partition: Partition number the positions belong to.
            index_within_partition_list: Positions inside that partition.

        Returns:
            A list with each position offset by the combined length of all
            partitions preceding `partition`.
        """
        total = 0
        for i in range(partition):
            total += self._lengths[i]

        return [total + i for i in index_within_partition_list]

    def squeeze(self, partition, index_within_partition):
        """Prepare a single coordinate for removal by "squeezing" the
        subsequent coordinates "up" one index within that partition. To be used
        with "_IndexMetadata.drop" for when all the "squeezed" coordinates are
        dropped in batch. Note that this function does not remove the
        coordinate itself; it only shifts the later ones, working on a copy
        of the coord_df that then replaces the stored one.
        """
        self._coord_df = self._coord_df.copy()

        partition_mask = self._coord_df.partition == partition
        index_within_partition_mask = \
            self._coord_df.index_within_partition > index_within_partition
        self._coord_df.loc[partition_mask & index_within_partition_mask,
                           'index_within_partition'] -= 1

    def copy(self):
        """Return a new _IndexMetadata with copies of the cached state.

        ObjectIDs are shared as-is; materialized values are copied.
        """
        # TODO: Investigate copy-on-write wrapper for metadata objects
        coord_df_copy = self._coord_df_cache
        if not isinstance(self._coord_df_cache, ray.ObjectID):
            coord_df_copy = self._coord_df_cache.copy()

        lengths_copy = self._lengths_cache
        if not isinstance(self._lengths_cache, ray.ObjectID):
            if isinstance(self._lengths_cache, list):
                # Plain lists have no `.copy()` on Python 2.
                lengths_copy = list(self._lengths_cache)
            else:
                lengths_copy = self._lengths_cache.copy()

        index_copy = self._index_cache
        if self._index_cache is not None:
            index_copy = self._index_cache.copy()

        return _IndexMetadata(index=index_copy,
                              coord_df_oid=coord_df_copy,
                              lengths_oid=lengths_copy)

    def __getitem__(self, key):
        """Returns the coordinates (partition, index_within_partition) of the
        provided key in the index. Essentially just an alias for
        `_IndexMetadata.coords_of` that allows for slice passing, since
        slices cannot be passed with slice notation other than through
        `__getitem__` calls.

        Args:
            key:
                item to get coordinates of. Can also be a tuple of item
                and {"partition", "index_within_partition"} if caller only
                needs one of the coordinates

        Returns:
            Pandas object with the keys specified. If key is a single object
            it will be a pandas.Series with items `partition` and
            `index_within_partition`, and if key is a slice or if the key is
            duplicate it will be a pandas.DataFrame with said items as columns.
        """
        return self.coords_of(key)

    def first_valid_index(self):
        """Delegate to the coord_df's `first_valid_index`."""
        return self._coord_df.first_valid_index()

    def last_valid_index(self):
        """Delegate to the coord_df's `last_valid_index`."""
        return self._coord_df.last_valid_index()

    def drop(self, labels, errors='raise'):
        """Drop the specified labels from the IndexMetadata

        Args:
            labels (scalar or list-like):
                The labels to drop
            errors ('raise' or 'ignore'):
                If 'ignore', suppress errors for when labels don't exist

        Returns:
            DataFrame with coordinates of dropped labels
        """
        dropped = self.coords_of(labels)

        # Update first lengths to prevent possible length inconsistencies
        if isinstance(dropped, pandas.DataFrame):
            try:
                drop_per_part = dropped.groupby(["partition"]).size()\
                    .reindex(index=pandas.RangeIndex(len(self._lengths)),
                             fill_value=0)
            except ValueError:
                # Copy the arrow sealed dataframe so we can mutate it.
                dropped = dropped.copy()
                drop_per_part = dropped.groupby(["partition"]).size()\
                    .reindex(index=pandas.RangeIndex(len(self._lengths)),
                             fill_value=0)
        elif isinstance(dropped, pandas.Series):
            drop_per_part = np.zeros_like(self._lengths)
            drop_per_part[dropped["partition"]] = 1
        else:
            raise AssertionError("Unrecognized result from `coords_of`")

        self._lengths = self._lengths - np.array(drop_per_part)

        new_coord_df = self._coord_df.drop(labels, errors=errors)

        # Shift partition numbers down past every partition that became
        # empty. Partitions are visited in increasing order, so an already
        # shifted value can never collide with a later `i`.
        num_dropped = 0
        for i, length in enumerate(self._lengths):
            if length == 0:
                num_dropped += 1
            if num_dropped > 0:
                # Single `.loc` assignment instead of chained indexing, which
                # pandas does not guarantee to write through.
                new_coord_df.loc[new_coord_df['partition'] == i,
                                 'partition'] -= num_dropped

        self._coord_df = new_coord_df
        return dropped

    def rename_index(self, mapper):
        """Rename the index.

        Args:
            mapper: name to rename the index as
        """
        self._coord_df = self._coord_df.rename_axis(mapper, axis=0)

    def convert_to_index_sliceable(self, key):
        """Converts and performs error checking on the passed slice

        Args:
            key: slice to convert and check
        """
        return convert_to_index_sliceable(self._coord_df, key)

    def get_partition(self, partition_id):
        """Return a view of coord_df where partition = partition_id
        """
        return self._coord_df[self._coord_df.partition == partition_id]

    def sorted_index(self):
        """Return the index ordered by (partition, index_within_partition)."""
        return (self._coord_df
                .sort_values(['partition', 'index_within_partition'])
                .index)
|
||||
@@ -1,469 +0,0 @@
|
||||
"""Indexing helper classes work as follows:

_Location_Indexer_Base provides the method framework for __getitem__
and __setitem__ that works with Ray DataFrame's internal index. The base
class's __{get,set}item__ takes in partition & idx_in_partition data
and performs the lookup/item write.

_Loc_Indexer and _iLoc_Indexer are responsible for indexer-specific logic
and lookup computation. _Loc_Indexer also takes care of enlarging the
dataframe. Both indexers take care of translating pandas's lookup to Ray
DataFrame's internal lookup.

An illustration is available at
https://github.com/ray-project/ray/pull/1955#issuecomment-386781826
"""
|
||||
import pandas
|
||||
import numpy as np
|
||||
import ray
|
||||
from warnings import warn
|
||||
|
||||
from pandas.api.types import (is_scalar, is_list_like, is_bool)
|
||||
from pandas.core.dtypes.common import is_integer
|
||||
from pandas.core.indexing import IndexingError
|
||||
|
||||
from .utils import (_blocks_to_col, _get_nan_block_id, extractor,
|
||||
_mask_block_partitions, writer)
|
||||
from .index_metadata import _IndexMetadata
|
||||
from .dataframe import DataFrame
|
||||
|
||||
|
||||
def is_slice(x):
    """Return True when ``x`` is a built-in ``slice`` object."""
    return isinstance(x, slice)
|
||||
|
||||
|
||||
def is_2d(x):
    """Return a truthy value when ``x`` is list-like or a ``slice``."""
    if is_list_like(x):
        return True
    return isinstance(x, slice)
|
||||
|
||||
|
||||
def is_tuple(x):
    """Return True when ``x`` is a ``tuple`` (or a tuple subclass)."""
    return isinstance(x, tuple)
|
||||
|
||||
|
||||
def is_boolean_array(x):
    """Return True when ``x`` is list-like and every element is a bool.

    An empty list-like counts as a boolean array.
    """
    if not is_list_like(x):
        return False
    return all(is_bool(element) for element in x)
|
||||
|
||||
|
||||
def is_integer_slice(x):
    """Return True when ``x`` is a slice whose bounds are all int or None.

    The start, stop and step of the slice are each checked.
    """
    if not isinstance(x, slice):
        return False
    bounds = (x.start, x.stop, x.step)
    # Every position must be either unset or an integer.
    return all(bound is None or is_integer(bound) for bound in bounds)
|
||||
|
||||
|
||||
_ENLARGEMENT_WARNING = """
|
||||
Passing list-likes to .loc or [] with any missing label will raise
|
||||
KeyError in the future, you can use .reindex() as an alternative.
|
||||
|
||||
See the documentation here:
|
||||
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
|
||||
"""
|
||||
|
||||
_ILOC_INT_ONLY_ERROR = """
|
||||
Location based indexing can only have [integer, integer slice (START point is
|
||||
INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types.
|
||||
"""
|
||||
|
||||
|
||||
def _parse_tuple(tup):
    """Unpack the user input for getitem and setitem and compute ndim

    loc[a] -> ([a], :), 1D
    loc[[a,b],] -> ([a,b], :), 2D
    loc[a,b] -> ([a], [b]), 0D

    Returns:
        A tuple ``(row_loc, col_loc, ndim)``. Scalar locators are wrapped in
        a one-element list; ``ndim`` is computed before that wrapping.

    Raises:
        IndexingError: If a tuple with more than two indexers is passed.
    """
    if is_tuple(tup):
        if len(tup) > 2:
            raise IndexingError('Too many indexers')
        row_loc = tup[0]
        col_loc = tup[1] if len(tup) == 2 else slice(None)
    else:
        row_loc, col_loc = tup, slice(None)

    # ndim has to be derived from the raw locators, before scalars get boxed.
    ndim = _compute_ndim(row_loc, col_loc)

    if is_scalar(row_loc):
        row_loc = [row_loc]
    if is_scalar(col_loc):
        col_loc = [col_loc]

    return row_loc, col_loc, ndim
|
||||
|
||||
|
||||
def _is_enlargement(locator, coord_df):
    """Determine if a locator will enlarge the coord_df.

    Enlargement happens when locating with labels that are not in the
    original index. In other words, enlargement == adding NaNs!

    Only non-empty, non-boolean list-like locators can enlarge; slices and
    scalars never do.
    """
    if not is_list_like(locator) or is_slice(locator):
        return False
    if not len(locator) > 0 or is_boolean_array(locator):
        return False

    missing_labels = pandas.Index(locator).difference(coord_df.index)
    return len(missing_labels) > 0
|
||||
|
||||
|
||||
def _warn_enlargement():
    """Emit the FutureWarning about list-like lookups with missing labels."""
    enlargement_warning = FutureWarning(_ENLARGEMENT_WARNING)
    warn(enlargement_warning)
|
||||
|
||||
|
||||
def _compute_ndim(row_loc, col_loc):
|
||||
"""Compute the ndim of result from locators
|
||||
"""
|
||||
row_scaler = is_scalar(row_loc)
|
||||
col_scaler = is_scalar(col_loc)
|
||||
|
||||
if row_scaler and col_scaler:
|
||||
ndim = 0
|
||||
elif row_scaler ^ col_scaler:
|
||||
ndim = 1
|
||||
else:
|
||||
ndim = 2
|
||||
|
||||
return ndim
|
||||
|
||||
|
||||
class _Location_Indexer_Base():
|
||||
"""Base class for location indexer like loc and iloc
|
||||
"""
|
||||
|
||||
def __init__(self, ray_df):
|
||||
self.df = ray_df
|
||||
self.col_coord_df = ray_df._col_metadata._coord_df
|
||||
self.row_coord_df = ray_df._row_metadata._coord_df
|
||||
self.block_oids = ray_df._block_partitions
|
||||
|
||||
self.is_view = False
|
||||
if isinstance(ray_df, DataFrameView):
|
||||
self.block_oids = ray_df._block_partitions_data
|
||||
self.is_view = True
|
||||
|
||||
def __getitem__(self, row_lookup, col_lookup, ndim):
|
||||
"""
|
||||
Args:
|
||||
row_lookup: A pandas dataframe, a partial view from row_coord_df
|
||||
col_lookup: A pandas dataframe, a partial view from col_coord_df
|
||||
ndim: the dimension of returned data
|
||||
"""
|
||||
if ndim == 2:
|
||||
return self._generate_view(row_lookup, col_lookup)
|
||||
|
||||
extracted = self._retrive_items(row_lookup, col_lookup)
|
||||
if ndim == 1:
|
||||
result = ray.get(_blocks_to_col.remote(*extracted)).squeeze()
|
||||
|
||||
if is_scalar(result):
|
||||
result = pandas.Series(result)
|
||||
|
||||
scaler_axis = row_lookup if len(row_lookup) == 1 else col_lookup
|
||||
series_name = scaler_axis.iloc[0].name
|
||||
result.name = series_name
|
||||
|
||||
index_axis = row_lookup if len(col_lookup) == 1 else col_lookup
|
||||
result.index = index_axis.index
|
||||
|
||||
if ndim == 0:
|
||||
result = ray.get(extracted[0]).squeeze()
|
||||
|
||||
return result
|
||||
|
||||
def _retrive_items(self, row_lookup, col_lookup):
|
||||
"""Given lookup dataframes, return a list of result oids
|
||||
"""
|
||||
result_oids = []
|
||||
|
||||
# We have to copy before we groupby because
|
||||
# https://github.com/pandas-dev/pandas/issues/10043
|
||||
row_groups = row_lookup.copy().groupby('partition')
|
||||
col_groups = col_lookup.copy().groupby('partition')
|
||||
for row_blk, row_data in row_groups:
|
||||
for col_blk, col_data in col_groups:
|
||||
block_oid = self.block_oids[row_blk, col_blk]
|
||||
row_idx = row_data['index_within_partition']
|
||||
col_idx = col_data['index_within_partition']
|
||||
|
||||
result_oid = extractor.remote(block_oid, row_idx, col_idx)
|
||||
result_oids.append(result_oid)
|
||||
return result_oids
|
||||
|
||||
def _generate_view(self, row_lookup, col_lookup):
|
||||
"""Generate a DataFrameView from lookup
|
||||
"""
|
||||
row_lengths = [0] * len(self.df._row_metadata._lengths)
|
||||
for i in row_lookup["partition"]:
|
||||
row_lengths[i] += 1
|
||||
col_lengths = [0] * len(self.df._col_metadata._lengths)
|
||||
for i in col_lookup["partition"]:
|
||||
col_lengths[i] += 1
|
||||
|
||||
row_metadata_view = _IndexMetadata(
|
||||
coord_df_oid=row_lookup, lengths_oid=row_lengths)
|
||||
|
||||
col_metadata_view = _IndexMetadata(
|
||||
coord_df_oid=col_lookup, lengths_oid=col_lengths)
|
||||
|
||||
df_view = DataFrameView(
|
||||
block_partitions=self.block_oids,
|
||||
row_metadata=row_metadata_view,
|
||||
col_metadata=col_metadata_view,
|
||||
index=row_metadata_view.index,
|
||||
columns=col_metadata_view.index)
|
||||
|
||||
return df_view
|
||||
|
||||
def __setitem__(self, row_lookup, col_lookup, item):
|
||||
"""
|
||||
Args:
|
||||
row_lookup: A pandas dataframe, a partial view from row_coord_df
|
||||
col_lookup: A pandas dataframe, a partial view from col_coord_df
|
||||
item: The new item needs to be set. It can be any shape that's
|
||||
broadcastable to the product of the lookup tables.
|
||||
"""
|
||||
to_shape = (len(row_lookup), len(col_lookup))
|
||||
item = self._broadcast_item(item, to_shape)
|
||||
self._write_items(row_lookup, col_lookup, item)
|
||||
|
||||
def _broadcast_item(self, item, to_shape):
|
||||
"""Use numpy to broadcast or reshape item.
|
||||
|
||||
Notes:
|
||||
- Numpy is memory efficent, there shouldn't be performance issue.
|
||||
"""
|
||||
try:
|
||||
item = np.array(item)
|
||||
if np.prod(to_shape) == np.prod(item.shape):
|
||||
return item.reshape(to_shape)
|
||||
else:
|
||||
return np.broadcast_to(item, to_shape)
|
||||
except ValueError:
|
||||
from_shape = np.array(item).shape
|
||||
raise ValueError(
|
||||
"could not broadcast input array from \
|
||||
shape {from_shape} into shape {to_shape}".format(
|
||||
from_shape=from_shape, to_shape=to_shape))
|
||||
|
||||
def _write_items(self, row_lookup, col_lookup, item):
|
||||
"""Perform remote write and replace blocks.
|
||||
"""
|
||||
|
||||
# We have to copy before we groupby because
|
||||
# https://github.com/pandas-dev/pandas/issues/10043
|
||||
row_groups = row_lookup.copy().groupby('partition')
|
||||
col_groups = col_lookup.copy().groupby('partition')
|
||||
|
||||
row_item_index = 0
|
||||
for row_blk, row_data in row_groups:
|
||||
row_len = len(row_data)
|
||||
|
||||
col_item_index = 0
|
||||
for col_blk, col_data in col_groups:
|
||||
col_len = len(col_data)
|
||||
|
||||
block_oid = self.block_oids[row_blk, col_blk]
|
||||
row_idx = row_data['index_within_partition']
|
||||
col_idx = col_data['index_within_partition']
|
||||
|
||||
item_to_write = item[row_item_index:row_item_index + row_len,
|
||||
col_item_index:col_item_index + col_len]
|
||||
|
||||
result_oid = writer.remote(block_oid, row_idx, col_idx,
|
||||
item_to_write)
|
||||
|
||||
if self.is_view:
|
||||
self.df._block_partitions_data[row_blk,
|
||||
col_blk] = result_oid
|
||||
else:
|
||||
self.df._block_partitions[row_blk, col_blk] = result_oid
|
||||
|
||||
col_item_index += col_len
|
||||
row_item_index += row_len
|
||||
|
||||
|
||||
class _Loc_Indexer(_Location_Indexer_Base):
    """A indexer for ray_df.loc[] functionality"""

    def __getitem__(self, key):
        """Label-based lookup; returns a view, a Series or a scalar."""
        row_loc, col_loc, ndim = _parse_tuple(key)
        self._handle_enlargement(row_loc, col_loc)
        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        ndim = self._expand_dim(row_lookup, col_lookup, ndim)
        return super(_Loc_Indexer, self).__getitem__(
            row_lookup, col_lookup, ndim)

    def __setitem__(self, key, item):
        """Label-based assignment of `item` to the locations under `key`."""
        row_loc, col_loc, _ = _parse_tuple(key)
        self._handle_enlargement(row_loc, col_loc)
        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        super(_Loc_Indexer, self).__setitem__(row_lookup, col_lookup, item)

    def _handle_enlargement(self, row_loc, col_loc):
        """Handle Enlargement (if there is one).

        For each axis whose locator contains labels missing from the
        coordinate table, the axis is enlarged, a FutureWarning is emitted,
        and this indexer's coord_df for that axis is replaced.

        Returns:
            None
        """
        metadata = {'row': self.df._row_metadata, 'col': self.df._col_metadata}
        per_axis = zip([row_loc, col_loc],
                       [self.row_coord_df, self.col_coord_df],
                       ['row', 'col'])

        for locator, coord_df, axis_name in per_axis:
            if not _is_enlargement(locator, coord_df):
                continue
            enlarged_meta = self._enlarge_axis(locator, axis=axis_name)
            _warn_enlargement()
            metadata[axis_name] = enlarged_meta

        self.row_coord_df = metadata['row']._coord_df
        self.col_coord_df = metadata['col']._coord_df

    def _enlarge_axis(self, locator, axis):
        """Add rows/columns to block partitions according to locator.

        Args:
            locator: The list-like of labels being looked up.
            axis: 'row' to enlarge along rows; any other value enlarges
                along columns.

        Returns:
            metadata (_IndexMetadata)
        """
        # 1. Prepare variables
        along_rows = axis == 'row'
        # major == the axis of the locator
        if along_rows:
            major_meta = self.df._row_metadata
            minor_meta = self.df._col_metadata
        else:
            major_meta = self.df._col_metadata
            minor_meta = self.df._row_metadata

        # 2. Compute the nan labels and add blocks
        nan_labels = self._compute_enlarge_labels(locator, major_meta.index)
        n_new = len(nan_labels)
        blk_part_n_row, blk_part_n_col = self.block_oids.shape

        # One NaN block per partition of the minor axis.
        nan_block_ids = [
            _get_nan_block_id(n_new, n_cols, transpose=not along_rows)
            for n_cols in minor_meta._lengths
        ]
        nan_blks = np.array([nan_block_ids])
        if not along_rows:
            nan_blks = nan_blks.T

        self.block_oids = np.concatenate(
            [self.block_oids, nan_blks], axis=0 if along_rows else 1)

        # 3. Prepare metadata to return
        # The new labels all live in one freshly appended partition.
        new_partition = blk_part_n_row if along_rows else blk_part_n_col
        records = []
        for position, label in zip(np.arange(n_new), nan_labels):
            records.append({
                '': label,
                'partition': new_partition,
                'index_within_partition': position
            })
        nan_coord_df = pandas.DataFrame(data=records).set_index('')

        coord_df = pandas.concat([major_meta._coord_df, nan_coord_df])
        coord_df = coord_df.loc[locator]  # Re-index that allows duplicates

        lens = np.concatenate([major_meta._lengths, np.array([n_new])])

        return _IndexMetadata(coord_df_oid=coord_df, lengths_oid=lens)

    def _compute_enlarge_labels(self, locator, base_index):
        """Helper for _enlarge_axis, compute common labels and extra labels.

        Returns:
            nan_labels: The labels needs to be added

        Raises:
            KeyError: If the locator shares no label with `base_index`.
        """
        # base_index_type can be pd.Index or pd.DatetimeIndex
        # depending on user input and pandas behavior
        # See issue #2264
        index_cls = type(base_index)
        locator_as_index = index_cls(locator)

        nan_labels = locator_as_index.difference(base_index)
        common_labels = locator_as_index.intersection(base_index)

        if len(common_labels) == 0:
            raise KeyError(
                'None of [{labels}] are in the [{base_index_name}]'.format(
                    labels=list(locator_as_index), base_index_name=base_index))

        return nan_labels

    def _expand_dim(self, row_lookup, col_lookup, ndim):
        """Expand the dimension if necessary.

        This method is for cases like duplicate labels.
        """
        multiple_rows = len(row_lookup) > 1
        multiple_cols = len(col_lookup) > 1

        if ndim == 0 and (multiple_rows or multiple_cols):
            ndim = 1
        if ndim == 1 and multiple_rows and multiple_cols:
            ndim = 2

        return ndim

    def _compute_lookup(self, row_loc, col_loc):
        """Resolve label locators into lookup tables via `.loc`."""
        row_lookup = self.row_coord_df.loc[row_loc]
        col_lookup = self.col_coord_df.loc[col_loc]
        return row_lookup, col_lookup
|
||||
|
||||
|
||||
class _iLoc_Indexer(_Location_Indexer_Base):
    """A indexer for ray_df.iloc[] functionality"""

    def __getitem__(self, key):
        """Position-based lookup; both locators are type-checked first."""
        row_loc, col_loc, ndim = _parse_tuple(key)

        for locator in (row_loc, col_loc):
            self._check_dtypes(locator)

        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        return super(_iLoc_Indexer, self).__getitem__(
            row_lookup, col_lookup, ndim)

    def __setitem__(self, key, item):
        """Position-based assignment; both locators are type-checked first."""
        row_loc, col_loc, _ = _parse_tuple(key)

        for locator in (row_loc, col_loc):
            self._check_dtypes(locator)

        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        super(_iLoc_Indexer, self).__setitem__(
            row_lookup, col_lookup, item)

    def _compute_lookup(self, row_loc, col_loc):
        """Resolve positional locators into lookup tables via `.iloc`."""
        row_lookup = self.row_coord_df.iloc[row_loc]
        col_lookup = self.col_coord_df.iloc[col_loc]
        return row_lookup, col_lookup

    def _check_dtypes(self, locator):
        """Validate that `locator` is acceptable for positional indexing.

        Accepted: an integer, an integer slice, a list-like of integers, or
        a boolean array.

        Raises:
            ValueError: If the locator is none of the accepted kinds.
        """
        accepted = [
            is_integer(locator),
            is_integer_slice(locator),
            is_list_like(locator) and all(map(is_integer, locator)),
            is_boolean_array(locator),
        ]

        if not any(accepted):
            raise ValueError(_ILOC_INT_ONLY_ERROR)
|
||||
|
||||
|
||||
class DataFrameView(DataFrame):
    """A subclass of DataFrame where the index can be smaller than blocks.

    The full block array is kept in `_block_partitions_data`; reading
    `_block_partitions` returns it masked by the view's row and column
    metadata.
    """

    def __init__(self, block_partitions, row_metadata, col_metadata, index,
                 columns):
        """Store the backing blocks, the view's metadata and its labels."""
        self._block_partitions = block_partitions
        self._row_metadata = row_metadata
        self._col_metadata = col_metadata
        self.index = index
        self.columns = columns

    def _get_block_partitions(self):
        """Return the backing blocks masked down to this view."""
        return _mask_block_partitions(self._block_partitions_data,
                                      self._row_metadata,
                                      self._col_metadata)

    def _set_block_partitions(self, new_block_partitions):
        """Replace the backing (unmasked) block array."""
        self._block_partitions_data = new_block_partitions

    _block_partitions = property(_get_block_partitions, _set_block_partitions)
|
||||
@@ -1,524 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from itertools import chain
|
||||
from io import BytesIO
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
import pandas
|
||||
|
||||
from pyarrow.parquet import ParquetFile
|
||||
from pandas.io.common import _infer_compression # don't depend on internal API
|
||||
|
||||
|
||||
from .dataframe import ray, DataFrame
|
||||
from . import get_npartitions
|
||||
from .utils import from_pandas
|
||||
|
||||
|
||||
# Parquet
|
||||
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
            We only support local files for now.
        engine: Ray only support pyarrow reader.
            This argument doesn't do anything for now.
        columns: Names of the columns to read. When None (the default),
            every column of the file schema is read except the ones named
            like ``__index_level_<N>__``.
        kwargs: Pass into parquet's read_row_group function.

    Returns:
        A DataFrame whose row partitions are the row groups of the file,
        re-split into chunks of ``num_rows // get_npartitions()`` rows
        (at least one row per chunk).
    """
    pf = ParquetFile(path)

    n_rows = pf.metadata.num_rows
    # With fewer rows than partitions the integer division yields 0, and a
    # zero chunk size would make _split_df loop without making progress.
    chunksize = max(1, n_rows // get_npartitions())
    n_row_groups = pf.metadata.num_row_groups

    if columns is None:
        # Only infer the column list when the caller did not request one.
        idx_regex = re.compile(r'__index_level_\d+__')
        columns = [
            name for name in pf.metadata.schema.names
            if not idx_regex.match(name)
        ]
    else:
        columns = list(columns)

    df_from_row_groups = [
        _read_parquet_row_group.remote(path, columns, i, kwargs)
        for i in range(n_row_groups)
    ]
    split_dfs = ray.get(
        [_split_df.remote(df, chunksize) for df in df_from_row_groups])
    df_remotes = list(chain.from_iterable(split_dfs))

    return DataFrame(row_partitions=df_remotes, columns=columns)
|
||||
|
||||
|
||||
@ray.remote
def _read_parquet_row_group(path, columns, row_group_id, kwargs=None):
    """Read one row group of a parquet file into a pandas DataFrame.

    Args:
        path: The filepath of the parquet file.
        columns: Column names forwarded to ``read_row_group``.
        row_group_id: Index of the row group to read.
        kwargs: Optional dict of extra keyword arguments for
            ``read_row_group``. None means no extra arguments.

    Returns:
        The row group converted with ``to_pandas()``.
    """
    # None instead of a shared ``{}`` default: no state is shared between
    # calls.
    if kwargs is None:
        kwargs = {}
    pf = ParquetFile(path)
    table = pf.read_row_group(row_group_id, columns=columns, **kwargs)
    return table.to_pandas()
|
||||
|
||||
|
||||
@ray.remote
def _split_df(pandas_df, chunksize):
    """Split a pandas_df into partitions of at most ``chunksize`` rows.

    Every partition gets a fresh 0-based index before it is put into the
    object store.

    Args:
        pandas_df: The pandas DataFrame to split.
        chunksize: Maximum number of rows per partition. A value below 1
            disables splitting and the whole frame becomes one partition.

    Returns:
        remote_df_ids ([ObjectID])
    """
    dataframes = []

    # A non-positive chunksize never shrinks the frame, so only enter the
    # loop when each iteration is guaranteed to make progress.
    if chunksize >= 1:
        while len(pandas_df) > chunksize:
            # reset_index returns a new frame; the result must be kept.
            head = pandas_df[:chunksize].reset_index(drop=True)
            dataframes.append(ray.put(head))
            pandas_df = pandas_df[chunksize:]

    # Whatever is left (possibly the whole frame) is the last partition.
    dataframes.append(ray.put(pandas_df.reset_index(drop=True)))

    return dataframes
|
||||
|
||||
|
||||
# CSV
|
||||
def _compute_offset(fn, npartitions, ignore_first_line=False):
|
||||
"""
|
||||
Calculate the currect bytes offsets for a csv file.
|
||||
Return a list of (start, end) tuple where the end == \n or EOF.
|
||||
"""
|
||||
total_bytes = os.path.getsize(fn)
|
||||
bio = open(fn, 'rb')
|
||||
if ignore_first_line:
|
||||
start = len(bio.readline())
|
||||
chunksize = (total_bytes - start) // npartitions
|
||||
else:
|
||||
start = 0
|
||||
chunksize = total_bytes // npartitions
|
||||
if chunksize == 0:
|
||||
chunksize = 1
|
||||
|
||||
offsets = []
|
||||
while start < total_bytes:
|
||||
bio.seek(chunksize, 1) # Move forward {chunksize} bytes
|
||||
extend_line = bio.readline() # Move after the next \n
|
||||
total_offset = chunksize + len(extend_line)
|
||||
# The position of the \n we just crossed.
|
||||
new_line_cursor = start + total_offset - 1
|
||||
offsets.append((start, new_line_cursor))
|
||||
start = new_line_cursor + 1
|
||||
|
||||
bio.close()
|
||||
return offsets
|
||||
|
||||
|
||||
def _get_firstline(file_path):
|
||||
bio = open(file_path, 'rb')
|
||||
first = bio.readline()
|
||||
bio.close()
|
||||
return first
|
||||
|
||||
|
||||
def _infer_column(first_line, kwargs={}):
|
||||
return pandas.read_csv(BytesIO(first_line), **kwargs).columns
|
||||
|
||||
|
||||
@ray.remote
def _read_csv_with_offset(fn, start, end, kwargs=None, header=b''):
    """Parse the bytes ``[start, end)`` of a csv file with pandas.

    Args:
        fn: Path of the csv file.
        start: Byte offset at which reading starts.
        end: Byte offset at which reading stops (exclusive).
        kwargs: Optional dict of keyword arguments for ``pandas.read_csv``.
            It is copied, never modified in place.
        header: Bytes prepended to the chunk before parsing.

    Returns:
        A ``(pandas_df, index)`` pair: the parsed frame re-indexed with a
        RangeIndex, and the index the frame had right after parsing.
    """
    # Work on a private copy so that neither the caller's dict nor a shared
    # default is mutated by the normalisation below.
    kwargs = dict(kwargs) if kwargs else {}
    if "quoting" in kwargs:
        kwargs["quoting"] = int(kwargs["quoting"])  # See issue #2078

    # The context manager closes the file even if seek/read raises.
    with open(fn, 'rb') as bio:
        bio.seek(start)
        to_read = header + bio.read(end - start)

    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    index = pandas_df.index
    # Partitions must have RangeIndex
    pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
    return pandas_df, index
|
||||
|
||||
|
||||
@ray.remote
def get_index(*partition_indices):
    """Concatenate per-partition indices into a single index.

    The first index is extended with all the remaining ones, in the order
    they were passed.
    """
    first, rest = partition_indices[0], partition_indices[1:]
    return first.append(rest)
|
||||
|
||||
|
||||
def read_csv(filepath_or_buffer,
             sep=',',
             delimiter=None,
             header='infer',
             names=None,
             index_col=None,
             usecols=None,
             squeeze=False,
             prefix=None,
             mangle_dupe_cols=True,
             dtype=None,
             engine=None,
             converters=None,
             true_values=None,
             false_values=None,
             skipinitialspace=False,
             skiprows=None,
             nrows=None,
             na_values=None,
             keep_default_na=True,
             na_filter=True,
             verbose=False,
             skip_blank_lines=True,
             parse_dates=False,
             infer_datetime_format=False,
             keep_date_col=False,
             date_parser=None,
             dayfirst=False,
             iterator=False,
             chunksize=None,
             compression='infer',
             thousands=None,
             decimal=b'.',
             lineterminator=None,
             quotechar='"',
             quoting=0,
             escapechar=None,
             comment=None,
             encoding=None,
             dialect=None,
             tupleize_cols=None,
             error_bad_lines=True,
             warn_bad_lines=True,
             skipfooter=0,
             skip_footer=0,
             doublequote=True,
             delim_whitespace=False,
             as_recarray=None,
             compact_ints=None,
             use_unsigned=None,
             low_memory=True,
             buffer_lines=None,
             memory_map=False,
             float_precision=None):
    """Read a csv file from local disk.

    Plain, uncompressed string paths are split into byte ranges that are
    parsed by remote tasks. Anything else (non-string input, or a path for
    which a compression is inferred) is read with ``pandas.read_csv``
    directly, after emitting a PendingDeprecationWarning.

    Args:
        filepath_or_buffer:
            The filepath of the csv file.
            We only support local files for now.
        The remaining arguments are collected into a dict and forwarded
        as keyword arguments to ``pandas.read_csv``.
    """
    kwargs = {
        'sep': sep,
        'delimiter': delimiter,
        'header': header,
        'names': names,
        'index_col': index_col,
        'usecols': usecols,
        'squeeze': squeeze,
        'prefix': prefix,
        'mangle_dupe_cols': mangle_dupe_cols,
        'dtype': dtype,
        'engine': engine,
        'converters': converters,
        'true_values': true_values,
        'false_values': false_values,
        'skipinitialspace': skipinitialspace,
        'skiprows': skiprows,
        'nrows': nrows,
        'na_values': na_values,
        'keep_default_na': keep_default_na,
        'na_filter': na_filter,
        'verbose': verbose,
        'skip_blank_lines': skip_blank_lines,
        'parse_dates': parse_dates,
        'infer_datetime_format': infer_datetime_format,
        'keep_date_col': keep_date_col,
        'date_parser': date_parser,
        'dayfirst': dayfirst,
        'iterator': iterator,
        'chunksize': chunksize,
        'compression': compression,
        'thousands': thousands,
        'decimal': decimal,
        'lineterminator': lineterminator,
        'quotechar': quotechar,
        'quoting': quoting,
        'escapechar': escapechar,
        'comment': comment,
        'encoding': encoding,
        'dialect': dialect,
        'tupleize_cols': tupleize_cols,
        'error_bad_lines': error_bad_lines,
        'warn_bad_lines': warn_bad_lines,
        'skipfooter': skipfooter,
        'skip_footer': skip_footer,
        'doublequote': doublequote,
        'delim_whitespace': delim_whitespace,
        'as_recarray': as_recarray,
        'compact_ints': compact_ints,
        'use_unsigned': use_unsigned,
        'low_memory': low_memory,
        'buffer_lines': buffer_lines,
        'memory_map': memory_map,
        'float_precision': float_precision,
    }

    # Default to Pandas read_csv for non-serializable objects
    use_pandas = (
        not isinstance(filepath_or_buffer, str)
        or _infer_compression(filepath_or_buffer, compression) is not None)
    if use_pandas:
        warnings.warn("Defaulting to Pandas implementation",
                      PendingDeprecationWarning)

        pandas_obj = pandas.read_csv(filepath_or_buffer, **kwargs)
        if not isinstance(pandas_obj, pandas.DataFrame):
            # Whatever pandas returned that is not a frame is handed back
            # untouched.
            return pandas_obj
        return from_pandas(pandas_obj, get_npartitions())

    filepath = filepath_or_buffer

    # TODO: handle case where header is a list of lines
    first_line = _get_firstline(filepath)
    columns = _infer_column(first_line, kwargs=kwargs)

    # The first line is data (not a header) when no header is requested or
    # when explicit names replace an inferred one.
    first_line_is_data = header is None or (
        header == "infer" and names is not None)
    ignore_first_line = not first_line_is_data
    if first_line_is_data:
        first_line = b""

    offsets = _compute_offset(filepath, get_npartitions(),
                              ignore_first_line=ignore_first_line)

    # Serialize objects to speed up later use in remote tasks
    first_line_id = ray.put(first_line)
    kwargs_id = ray.put(kwargs)

    df_obj_ids = []
    index_obj_ids = []
    for start, end in offsets:
        task_args = (filepath, start, end, kwargs_id)
        if start != 0:
            # Chunks that do not begin at byte 0 get the first line
            # prepended by the remote task.
            task_args += (first_line_id,)
        df, index = _read_csv_with_offset._submit(
            args=task_args,
            num_return_vals=2)
        df_obj_ids.append(df)
        index_obj_ids.append(index)

    if index_col is not None:
        index = get_index.remote(*index_obj_ids)
    else:
        index = None

    return DataFrame(row_partitions=df_obj_ids, columns=columns, index=index)
|
||||
|
||||
|
||||
def read_json(path_or_buf=None,
              orient=None,
              typ='frame',
              dtype=True,
              convert_axes=True,
              convert_dates=True,
              keep_default_dates=True,
              numpy=False,
              precise_float=False,
              date_unit=None,
              encoding=None,
              lines=False,
              chunksize=None,
              compression='infer'):
    """Read JSON through pandas and convert the result with from_pandas.

    Emits a PendingDeprecationWarning, forwards every argument positionally
    to ``pandas.read_json`` and converts what it returns using the default
    number of partitions.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    pandas_args = (path_or_buf, orient, typ, dtype, convert_axes,
                   convert_dates, keep_default_dates, numpy, precise_float,
                   date_unit, encoding, lines, chunksize, compression)
    return from_pandas(pandas.read_json(*pandas_args), get_npartitions())
|
||||
|
||||
|
||||
def read_html(io,
              match='.+',
              flavor=None,
              header=None,
              index_col=None,
              skiprows=None,
              attrs=None,
              parse_dates=False,
              tupleize_cols=None,
              thousands=',',
              encoding=None,
              decimal='.',
              converters=None,
              na_values=None,
              keep_default_na=True):
    """Read HTML through pandas and convert the first result.

    Emits a PendingDeprecationWarning and forwards every argument
    positionally to ``pandas.read_html``. Only element 0 of what pandas
    returns is converted with ``from_pandas``; further elements are not
    used.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    pandas_args = (io, match, flavor, header, index_col, skiprows, attrs,
                   parse_dates, tupleize_cols, thousands, encoding, decimal,
                   converters, na_values, keep_default_na)
    tables = pandas.read_html(*pandas_args)
    return from_pandas(tables[0], get_npartitions())
|
||||
|
||||
|
||||
def read_clipboard(sep=r'\s+'):
    """Read the clipboard through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning before delegating to
    ``pandas.read_clipboard``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    return from_pandas(pandas.read_clipboard(sep), get_npartitions())
|
||||
|
||||
|
||||
def read_excel(io,
               sheet_name=0,
               header=0,
               skiprows=None,
               skip_footer=0,
               index_col=None,
               names=None,
               usecols=None,
               parse_dates=False,
               date_parser=None,
               na_values=None,
               thousands=None,
               convert_float=True,
               converters=None,
               dtype=None,
               true_values=None,
               false_values=None,
               engine=None,
               squeeze=False):
    """Read an Excel source through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning and forwards every argument
    positionally to ``pandas.read_excel``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    pandas_args = (io, sheet_name, header, skiprows, skip_footer, index_col,
                   names, usecols, parse_dates, date_parser, na_values,
                   thousands, convert_float, converters, dtype, true_values,
                   false_values, engine, squeeze)
    return from_pandas(pandas.read_excel(*pandas_args), get_npartitions())
|
||||
|
||||
|
||||
def read_hdf(path_or_buf,
             key=None,
             mode='r'):
    """Read an HDF source through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning and forwards the arguments
    positionally to ``pandas.read_hdf``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    return from_pandas(pandas.read_hdf(path_or_buf, key, mode),
                       get_npartitions())
|
||||
|
||||
|
||||
def read_feather(path,
                 nthreads=1):
    """Read a feather file through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning before delegating to
    ``pandas.read_feather``.

    Args:
        path: Location of the feather file, passed to pandas unchanged.
        nthreads: Forwarded to ``pandas.read_feather`` (previously accepted
            but silently ignored).
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_feather(path, nthreads=nthreads)
    ray_frame = from_pandas(port_frame, get_npartitions())

    return ray_frame
|
||||
|
||||
|
||||
def read_msgpack(path_or_buf,
                 encoding='utf-8',
                 iterator=False):
    """Read msgpack data through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning and forwards the arguments
    positionally to ``pandas.read_msgpack``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    return from_pandas(pandas.read_msgpack(path_or_buf, encoding, iterator),
                       get_npartitions())
|
||||
|
||||
|
||||
def read_stata(filepath_or_buffer,
               convert_dates=True,
               convert_categoricals=True,
               encoding=None,
               index_col=None,
               convert_missing=False,
               preserve_dtypes=True,
               columns=None,
               order_categoricals=True,
               chunksize=None,
               iterator=False):
    """Read a Stata source through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning and forwards every argument
    positionally to ``pandas.read_stata``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    pandas_args = (filepath_or_buffer, convert_dates, convert_categoricals,
                   encoding, index_col, convert_missing, preserve_dtypes,
                   columns, order_categoricals, chunksize, iterator)
    return from_pandas(pandas.read_stata(*pandas_args), get_npartitions())
|
||||
|
||||
|
||||
def read_sas(filepath_or_buffer,
             format=None,
             index=None,
             encoding=None,
             chunksize=None,
             iterator=False):
    """Read a SAS source through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning and forwards every argument
    positionally to ``pandas.read_sas``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    pandas_args = (filepath_or_buffer, format, index, encoding, chunksize,
                   iterator)
    return from_pandas(pandas.read_sas(*pandas_args), get_npartitions())
|
||||
|
||||
|
||||
def read_pickle(path,
                compression='infer'):
    """Read a pickle through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning and forwards the arguments
    positionally to ``pandas.read_pickle``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    return from_pandas(pandas.read_pickle(path, compression),
                       get_npartitions())
|
||||
|
||||
|
||||
def read_sql(sql,
             con,
             index_col=None,
             coerce_float=True,
             params=None,
             parse_dates=None,
             columns=None,
             chunksize=None):
    """Run a SQL read through pandas and convert it with from_pandas.

    Emits a PendingDeprecationWarning and forwards every argument
    positionally to ``pandas.read_sql``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    pandas_args = (sql, con, index_col, coerce_float, params, parse_dates,
                   columns, chunksize)
    return from_pandas(pandas.read_sql(*pandas_args), get_npartitions())
|
||||
@@ -1,29 +0,0 @@
|
||||
from collections import Iterator
|
||||
|
||||
|
||||
class PartitionIterator(Iterator):
    """Iterator chaining the inner iterators of a sequence of partitions."""

    def __init__(self, partitions, func):
        """PartitionIterator class to define a generator on partitioned data

        Args:
            partitions ([ObjectID]): Partitions to iterate over
            func (callable): The function to get inner iterables from
                each partition
        """
        self.partitions = iter(partitions)
        self.func = func
        # Starts exhausted so that the first call to next() loads the
        # first partition.
        self.iter_cache = iter([])

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next item, moving on to later partitions as needed.

        Raises:
            StopIteration: When the current inner iterator and all
                remaining partitions are exhausted.
        """
        # A loop instead of recursion: any number of consecutive empty
        # partitions is skipped without growing the call stack.
        while True:
            try:
                return next(self.iter_cache)
            except StopIteration:
                # next() on the partitions raises StopIteration once they
                # are used up, which ends the whole iteration.
                self.iter_cache = self.func(next(self.partitions))
|
||||
@@ -1,94 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import inspect
|
||||
|
||||
|
||||
def code_gen(pandas_obj, ray_obj, path):
    """Generate code skeleton for methods not in Ray

    For every name returned by ``pandas_ray_diff`` a stub is written that
    raises NotImplementedError: a ``def`` with the pandas signature when
    one can be inspected, otherwise a ``@property``.

    Args:
        pandas_obj: The pandas object to generate code from.
        ray_obj: The ray object to diff against.
        path: Path to output the file to.
    """
    with open(path, "w") as outfile:
        funcs = pandas_ray_diff(pandas_obj, ray_obj)

        for func in funcs:
            # Skip single-underscore names. startswith (rather than
            # indexing) also copes with a one-character name such as "_".
            if func.startswith("_") and not func.startswith("__"):
                continue
            if "attr" in func:
                # let's not mess with these
                continue
            try:
                signature = str(inspect.signature(getattr(pandas_obj, func)))
            except TypeError:
                # No signature could be inspected: emit a property stub.
                outfile.write("\n@property")
                outfile.write("\ndef " + func + "(self):\n")
            except ValueError:
                continue
            else:
                outfile.write("\ndef " + func + signature + ":\n")
            # Four-space body indent, matching standard Python style.
            outfile.write(
                "    raise NotImplementedError(\"Not Yet implemented.\")\n")
|
||||
|
||||
|
||||
def code_gen_test(ray_obj, path, name):
    """Generate tests for methods in Ray.

    Appends to ``path`` one ``test_<attr>`` function per attribute of
    ``ray_obj`` (single-underscore names are skipped). Each test creates
    the object with ``create_test_<name>()`` and accesses the attribute
    inside ``pytest.raises(NotImplementedError)``; attributes whose
    signature can be inspected are called with ``None`` arguments.

    Args:
        ray_obj: The Ray object whose attributes are enumerated.
        path: File the generated tests are appended to.
        name: Suffix used for the generated variable and factory names.
    """
    with open(path, "a") as outfile:
        for func in dir(ray_obj):
            # startswith (rather than indexing) also copes with a
            # one-character name such as "_".
            if func.startswith("_") and not func.startswith("__"):
                continue

            outfile.write("\n\ndef test_" + func + "():\n")
            # The statement under ``with`` must be indented deeper than the
            # ``with`` line itself for the generated file to be valid.
            outfile.write(
                "    ray_" + name + " = create_test_" + name + "()\n\n" +
                "    with pytest.raises(NotImplementedError):\n" +
                "        ray_" + name + "." + func)
            try:
                param_num = len(
                    inspect.signature(getattr(ray_obj, func)).parameters)
            except (TypeError, ValueError, NotImplementedError):
                # No usable signature: leave it as a bare attribute access.
                outfile.write("\n")
                continue

            # With more than one parameter, one slot is dropped
            # (presumably the implicit ``self``).
            if param_num > 1:
                param_num -= 1

            outfile.write("(" + ", ".join(["None"] * param_num) + ")\n")
|
||||
|
||||
|
||||
def pandas_ray_diff(pandas_obj, ray_obj):
    """Gets the diff of the methods in the Pandas and Ray objects.

    Names starting with a single underscore are ignored; public names and
    names starting with a double underscore are compared.

    Args:
        pandas_obj: The Pandas object to diff.
        ray_obj: The Ray object to diff.

    Returns:
        A sorted list of the attribute names found on ``pandas_obj`` but
        not on ``ray_obj``.
    """
    # Built once: membership tests below are O(1) instead of rebuilding
    # the set for every candidate name.
    ray_funcs = set(dir(ray_obj))

    # startswith (rather than indexing) also copes with a one-character
    # name such as "_". Sorting makes the result reproducible between runs.
    return sorted(
        f for f in set(dir(pandas_obj))
        if (f.startswith("__") or not f.startswith("_"))
        and f not in ray_funcs)
|
||||
@@ -1,125 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray
|
||||
import pandas
|
||||
import numpy as np
|
||||
|
||||
from pandas import compat
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from itertools import cycle
|
||||
|
||||
from .dataframe import DataFrame
|
||||
from .utils import _deploy_func
|
||||
|
||||
|
||||
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False):
    """Convert categorical variable into indicator variables.

    Args:
        data (array-like, Series, or DataFrame): data to encode.
        prefix (string, [string]): Prefix to apply to each encoded column
            label.
        prefix_sep (string, [string]): Separator between prefix and value.
        dummy_na (bool): Add a column to indicate NaNs.
        columns: Which columns to encode.
        sparse (bool): Not Implemented: If True, returns SparseDataFrame.
        drop_first (bool): Whether to remove the first level of encoded data.

    Returns:
        DataFrame or one-hot encoded data.

    Raises:
        NotImplementedError: If ``sparse`` is true for a Ray DataFrame.
        ValueError: If a list-like ``prefix`` or ``prefix_sep`` does not
            have one entry per encoded column.
    """
    if not isinstance(data, DataFrame):
        # Anything that is not a Ray DataFrame is handled by pandas itself.
        return pandas.get_dummies(data, prefix=prefix, prefix_sep=prefix_sep,
                                  dummy_na=dummy_na, columns=columns,
                                  sparse=sparse, drop_first=drop_first)

    if sparse:
        raise NotImplementedError(
            "SparseDataFrame is not implemented. "
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    if columns is None:
        columns_to_encode = data.dtypes.isin([np.dtype("O"), 'category'])
        columns_to_encode = data.columns[columns_to_encode]
    else:
        columns_to_encode = columns

    n_encoded = len(columns_to_encode)

    def check_len(item, name):
        len_msg = ("Length of '{name}' ({len_item}) did not match the "
                   "length of the columns being encoded ({len_enc}).")

        if is_list_like(item) and len(item) != n_encoded:
            len_msg = len_msg.format(name=name, len_item=len(item),
                                     len_enc=n_encoded)
            raise ValueError(len_msg)

    check_len(prefix, 'prefix')
    check_len(prefix_sep, 'prefix_sep')

    # Normalise prefix to one entry per encoded column.
    if isinstance(prefix, compat.string_types):
        prefix = [prefix] * n_encoded
    if isinstance(prefix, dict):
        prefix = [prefix[col] for col in columns_to_encode]

    if prefix is None:
        prefix = columns_to_encode

    # validate separators
    if isinstance(prefix_sep, compat.string_types):
        prefix_sep = [prefix_sep] * n_encoded
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[col] for col in columns_to_encode]

    if set(columns_to_encode) == set(data.columns):
        with_dummies = []
        # An explicit empty list: Index() called without data is rejected
        # by the pandas version this package requires.
        dropped_columns = pandas.Index([])
    else:
        with_dummies = data.drop(columns_to_encode, axis=1)._col_partitions
        dropped_columns = data.columns.drop(columns_to_encode)

    def get_dummies_remote(df, to_drop, prefix, prefix_sep):
        df = df.drop(to_drop, axis=1)

        if df.size == 0:
            return df, df.columns

        df = pandas.get_dummies(df, prefix=prefix, prefix_sep=prefix_sep,
                                dummy_na=dummy_na, columns=None, sparse=sparse,
                                drop_first=drop_first)
        columns = df.columns
        # The labels are returned separately; the partition itself gets
        # positional column labels.
        df.columns = pandas.RangeIndex(0, len(df.columns))
        return df, columns

    total = 0
    columns = []
    for i, part in enumerate(data._col_partitions):
        col_index = data._col_metadata.partition_series(i)

        # TODO(kunalgosar): Handle the case of duplicate columns here
        to_encode = col_index.index.isin(columns_to_encode)

        to_encode = col_index[to_encode]
        to_drop = col_index.drop(to_encode.index)

        # Each partition receives the slice of prefixes/separators that
        # matches the encoded columns it holds.
        result = _deploy_func._submit(
            args=(get_dummies_remote, part, to_drop,
                  prefix[total:total + len(to_encode)],
                  prefix_sep[total:total + len(to_encode)]),
            num_return_vals=2)

        with_dummies.append(result[0])
        columns.append(result[1])
        total += len(to_encode)

    columns = ray.get(columns)
    dropped_columns = dropped_columns.append(columns)

    return DataFrame(col_partitions=with_dummies,
                     columns=dropped_columns,
                     index=data.index)
|
||||
@@ -1,970 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pandas
|
||||
|
||||
from .utils import _inherit_docstrings
|
||||
|
||||
|
||||
def na_op():
    """Placeholder for the na-handling helper that pandas provides.

    Raises:
        NotImplementedError: Always.
    """
    message = "Not Yet implemented."
    raise NotImplementedError(message)
|
||||
|
||||
|
||||
@_inherit_docstrings(pandas.Series, excluded=[pandas.Series,
|
||||
pandas.Series.__init__])
|
||||
class Series(object):
|
||||
|
||||
def __init__(self, series_oids):
|
||||
"""Constructor for a Series object.
|
||||
|
||||
Args:
|
||||
series_oids ([ObjectID]): The list of remote Series objects.
|
||||
"""
|
||||
self.series_oids = series_oids
|
||||
|
||||
@property
|
||||
def T(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __abs__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __add__(self, right, name='__add__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __and__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __array__(self, result=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __array_prepare__(self, result, context=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def __array_priority__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __array_wrap__(self, result, context=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __bool__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __bytes__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __class__(self, data=None, index=None, dtype=None, name=None,
|
||||
copy=False, fastpath=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __contains__(self, key):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __copy__(self, deep=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __deepcopy__(self, memo=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __delitem__(self, key):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __dir__(self):
|
||||
return list(type(self).__dict__.keys())
|
||||
|
||||
def __div__(self, right, name='__truediv__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __divmod__(self, right, name='__divmod__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def __doc__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __eq__(self, other, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __finalize__(self, other, method=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __float__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __floordiv__(self, right, name='__floordiv__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __ge__(self, other, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __getitem__(self, key):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __getstate__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __gt__(self, other, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __iadd__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __imul__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __int__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __invert__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __ipow__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __isub__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __iter__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __itruediv__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __le__(self, other, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __len__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __long__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __lt__(self, other, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __mod__(self, right, name='__mod__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __mul__(self, right, name='__mul__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __ne__(self, other, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __neg__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __nonzero__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __or__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __pow__(self, right, name='__pow__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __repr__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __round__(self, decimals=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __setstate__(self, state):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __sizeof__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __str__(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __sub__(self, right, name='__sub__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __truediv__(self, right, name='__truediv__', na_op=na_op):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def __xor__(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def abs(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def add(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def add_prefix(self, prefix):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def add_suffix(self, suffix):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def agg(self, func, axis=0, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def aggregate(self, func, axis=0, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def align(self, other, join='outer', axis=None, level=None, copy=True,
|
||||
fill_value=None, method=None, limit=None, fill_axis=0,
|
||||
broadcast_axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def all(self, axis=None, bool_only=None, skipna=None, level=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def any(self, axis=None, bool_only=None, skipna=None, level=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def append(self, to_append, ignore_index=False, verify_integrity=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def apply(self, func, convert_dtype=True, args=(), **kwds):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def argmax(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def argmin(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def argsort(self, axis=0, kind='quicksort', order=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def as_blocks(self, copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def as_matrix(self, columns=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def asfreq(self, freq, method=None, how=None, normalize=False,
|
||||
fill_value=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def asof(self, where, subset=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def astype(self, dtype, copy=True, errors='raise', **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def at(self, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def at_time(self, time, asof=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def autocorr(self, lag=1):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def between(self, left, right, inclusive=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def between_time(self, start_time, end_time, include_start=True,
|
||||
include_end=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def bool(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def clip_lower(self, threshold, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def clip_upper(self, threshold, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def combine(self, other, func, fill_value=np.nan):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def combine_first(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def compound(self, axis=None, skipna=None, level=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def compress(self, condition, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def consolidate(self, inplace=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def convert_objects(self, convert_dates=True, convert_numeric=False,
|
||||
convert_timedeltas=True, copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def copy(self, deep=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def corr(self, other, method='pearson', min_periods=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def count(self, level=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cov(self, other, min_periods=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cummax(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cummin(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cumprod(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def cumsum(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def describe(self, percentiles=None, include=None, exclude=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def diff(self, periods=1):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def div(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def divide(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def dot(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def drop_duplicates(self, keep='first', inplace=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def dropna(self, axis=0, inplace=False, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def duplicated(self, keep='first'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def eq(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def equals(self, other):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ewm(self, com=None, span=None, halflife=None, alpha=None,
|
||||
min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def expanding(self, min_periods=1, freq=None, center=False, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def factorize(self, sort=False, na_sentinel=-1):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def fillna(self, value=None, method=None, axis=None, inplace=False,
|
||||
limit=None, downcast=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def filter(self, items=None, like=None, regex=None, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def first(self, offset):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def first_valid_index(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def floordiv(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def from_array(self, arr, index=None, name=None, dtype=None, copy=False,
|
||||
fastpath=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def from_csv(self, path, sep=',', parse_dates=True, header=None,
|
||||
index_col=0, encoding=None, infer_datetime_format=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ge(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def get(self, key, default=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def get_dtype_counts(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def get_ftype_counts(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def get_value(self, label, takeable=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def get_values(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
|
||||
group_keys=True, squeeze=False, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def gt(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def head(self, n=5):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def hist(self, by=None, ax=None, grid=True, xlabelsize=None, xrot=None,
|
||||
ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def iat(self, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def idxmax(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def idxmin(self, axis=None, skipna=True, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def iloc(self, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
|
||||
limit_direction='forward', downcast=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def isin(self, values):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def isnull(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def item(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def items(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def iteritems(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ix(self, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def keys(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def kurt(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def last(self, offset):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def last_valid_index(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def le(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def loc(self, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def lt(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def mad(self, axis=None, skipna=None, level=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def map(self, arg, na_action=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
|
||||
try_cast=False, raise_on_error=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def max(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def median(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def memory_usage(self, index=True, deep=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def min(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def mod(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def mode(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def mul(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def multiply(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ne(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def nlargest(self, n=5, keep='first'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def nonzero(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def notnull(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def nsmallest(self, n=5, keep='first'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def nunique(self, dropna=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def pipe(self, func, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def plot(self, kind='line', ax=None, figsize=None, use_index=True,
|
||||
title=None, grid=None, legend=False, style=None, logx=False,
|
||||
logy=False, loglog=False, xticks=None, yticks=None, xlim=None,
|
||||
ylim=None, rot=None, fontsize=None, colormap=None, table=False,
|
||||
yerr=None, xerr=None, label=None, secondary_y=False, **kwds):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def pop(self, item):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def pow(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def prod(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def product(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ptp(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def put(self, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def quantile(self, q=0.5, interpolation='linear'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def radd(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rank(self, axis=0, method='average', numeric_only=None,
|
||||
na_option='keep', ascending=True, pct=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def ravel(self, order='C'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rdiv(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def reindex(self, index=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def reindex_axis(self, labels, axis=0, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def reindex_like(self, other, method=None, copy=True, limit=None,
|
||||
tolerance=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rename(self, index=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rename_axis(self, mapper, axis=0, copy=True, inplace=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def reorder_levels(self, order):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def repeat(self, repeats, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def replace(self, to_replace=None, value=None, inplace=False, limit=None,
|
||||
regex=False, method='pad', axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
|
||||
label=None, convention='start', kind=None, loffset=None,
|
||||
limit=None, base=0, on=None, level=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def reset_index(self, level=None, drop=False, name=None, inplace=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def reshape(self, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rfloordiv(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rmod(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rmul(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rolling(self, window, min_periods=None, freq=None, center=False,
|
||||
win_type=None, on=None, axis=0, closed=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def round(self, decimals=0, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rpow(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rsub(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def rtruediv(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def sample(self, n=None, frac=None, replace=False, weights=None,
|
||||
random_state=None, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def searchsorted(self, value, side='left', sorter=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def select(self, crit, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def sem(self, axis=None, skipna=None, level=None, ddof=1,
|
||||
numeric_only=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def set_axis(self, axis, labels):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def set_value(self, label, value, takeable=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def shift(self, periods=1, freq=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def skew(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def slice_shift(self, periods=1, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
|
||||
kind='quicksort', na_position='last', sort_remaining=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def sort_values(self, axis=0, ascending=True, inplace=False,
|
||||
kind='quicksort', na_position='last'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def squeeze(self, axis=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def std(self, axis=None, skipna=None, level=None, ddof=1,
|
||||
numeric_only=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def sub(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def subtract(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def sum(self, axis=None, skipna=None, level=None, numeric_only=None,
|
||||
**kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def swapaxes(self, axis1, axis2, copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def swaplevel(self, i=-2, j=-1, copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def tail(self, n=5):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_clipboard(self, excel=None, sep=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_csv(self, path=None, index=True, sep=',', na_rep='',
|
||||
float_format=None, header=False, index_label=None, mode='w',
|
||||
encoding=None, date_format=None, decimal='.'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_dense(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_dict(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
|
||||
float_format=None, columns=None, header=True, index=True,
|
||||
index_label=None, startrow=0, startcol=0, engine=None,
|
||||
merge_cells=True, encoding=None, inf_rep='inf',
|
||||
verbose=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_frame(self, name=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_hdf(self, path_or_buf, key, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_json(self, path_or_buf=None, orient=None, date_format=None,
|
||||
double_precision=10, force_ascii=True, date_unit='ms',
|
||||
default_handler=None, lines=False):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_latex(self, buf=None, columns=None, col_space=None, header=True,
|
||||
index=True, na_rep='NaN', formatters=None, float_format=None,
|
||||
sparsify=None, index_names=True, bold_rows=False,
|
||||
column_format=None, longtable=None, escape=None,
|
||||
encoding=None, decimal='.', multicolumn=None,
|
||||
multicolumn_format=None, multirow=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_period(self, freq=None, copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_pickle(self, path, compression='infer'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_sparse(self, kind='block', fill_value=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
|
||||
index=True, index_label=None, chunksize=None, dtype=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_string(self, buf=None, na_rep='NaN', float_format=None,
|
||||
header=True, index=True, length=False, dtype=False,
|
||||
name=False, max_rows=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_timestamp(self, freq=None, how='start', copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def to_xarray(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def tolist(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def transform(self, func, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def transpose(self, *args, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def truediv(self, other, level=None, fill_value=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def truncate(self, before=None, after=None, axis=None, copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def tshift(self, periods=1, freq=None, axis=0):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def tz_convert(self, tz, axis=0, level=None, copy=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def tz_localize(self, tz, axis=0, level=None, copy=True,
|
||||
ambiguous='raise'):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def unique(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def unstack(self, level=-1, fill_value=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def update(self, other):
    """Placeholder for the pandas ``Series.update`` API.

    Args:
        other: The object whose values would be used for the update.

    Raises:
        NotImplementedError: Always; the operation is not implemented.
    """
    raise NotImplementedError("Not Yet implemented.")


# Backward-compatible alias: the method was originally published under this
# name, which looks like "update" after a blanket "pd" -> "pandas" rename.
upandasate = update
|
||||
|
||||
def valid(self, inplace=False, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def value_counts(self, normalize=False, sort=True, ascending=False,
|
||||
bins=None, dropna=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def var(self, axis=None, skipna=None, level=None, ddof=1,
|
||||
numeric_only=None, **kwargs):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def view(self, dtype=None):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
|
||||
try_cast=False, raise_on_error=True):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
def xs(self, key, axis=0, level=None, drop_level=True):
    """Placeholder for the pandas ``xs`` (cross-section) API.

    The original signature omitted ``self``, so a bound call placed the
    instance in ``key`` and ``obj.xs(key=...)`` failed with a ``TypeError``
    instead of the intended ``NotImplementedError``.

    Args:
        key: Label selecting the cross-section.
        axis: Axis to take the cross-section from.
        level: Index level(s) the key refers to, if any.
        drop_level: Whether the selected level(s) would be dropped.

    Raises:
        NotImplementedError: Always; the operation is not implemented.
    """
    raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def asobject(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def axes(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def base(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def blocks(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def data(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def dtype(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def dtypes(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def empty(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def flags(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def ftype(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def ftypes(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def hasnans(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def imag(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def index(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def is_copy(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def is_monotonic(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def is_monotonic_decreasing(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def is_monotonic_increasing(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def is_unique(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def itemsize(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def nbytes(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def ndim(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def real(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def strides(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
@property
|
||||
def values(self):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
@@ -1,127 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import pandas
|
||||
import ray.dataframe as pd
|
||||
from ray.dataframe.utils import (
|
||||
to_pandas,
|
||||
from_pandas
|
||||
)
|
||||
|
||||
|
||||
def ray_df_equals_pandas(ray_df, pandas_df):
    """Return whether a Ray DataFrame matches a pandas DataFrame.

    This is a plain helper rather than a pytest fixture: the tests call it
    directly with arguments, which pytest rejects for fixture functions
    (an error since pytest 4).

    Args:
        ray_df: Frame that is converted with ``to_pandas`` before comparing.
        pandas_df: The pandas frame holding the expected contents.

    Returns:
        The result of ``equals`` on the two frames after each has been
        sorted by index.
    """
    converted = to_pandas(ray_df).sort_index()
    expected = pandas_df.sort_index()
    return converted.equals(expected)
|
||||
|
||||
|
||||
def generate_dfs():
    """Build the two small pandas frames used as concat test input.

    This is a plain helper rather than a pytest fixture: the tests call it
    directly, which pytest rejects for fixture functions (an error since
    pytest 4).

    Returns:
        A ``(df, df2)`` tuple of four-row pandas DataFrames. Both have
        ``col1``-``col3`` with identical values; ``df`` additionally has
        ``col4``/``col5`` and ``df2`` has ``col6``/``col7``.
    """
    df = pandas.DataFrame({
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, 6, 7],
        'col3': [8, 9, 10, 11],
        'col4': [12, 13, 14, 15],
        'col5': [0, 0, 0, 0],
    })

    df2 = pandas.DataFrame({
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, 6, 7],
        'col3': [8, 9, 10, 11],
        'col6': [12, 13, 14, 15],
        'col7': [0, 0, 0, 0],
    })
    return df, df2
|
||||
|
||||
|
||||
def generate_none_dfs():
    """Build two pandas frames for concat tests, the first containing ``None``.

    This is a plain helper rather than a pytest fixture: the tests call it
    directly, which pytest rejects for fixture functions (an error since
    pytest 4).

    Returns:
        A ``(df, df2)`` tuple of four-row pandas DataFrames. ``df`` has one
        ``None`` in ``col2`` and an all-``None`` ``col5``; ``df2`` has no
        missing values and carries ``col6``/``col7`` instead of
        ``col4``/``col5``.
    """
    df = pandas.DataFrame({
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, None, 7],
        'col3': [8, 9, 10, 11],
        'col4': [12, 13, 14, 15],
        'col5': [None, None, None, None],
    })

    df2 = pandas.DataFrame({
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, 6, 7],
        'col3': [8, 9, 10, 11],
        'col6': [12, 13, 14, 15],
        'col7': [0, 0, 0, 0],
    })
    return df, df2
|
||||
|
||||
|
||||
# NOTE(review): this function is registered as a fixture, so pytest does not
# collect it as a test despite the ``test_`` prefix -- confirm whether that is
# intentional before removing the decorator.
@pytest.fixture
def test_df_concat():
    """Compare ``pd.concat`` on two plain pandas frames with ``pandas.concat``."""
    left, right = generate_dfs()

    frames_match = ray_df_equals_pandas(pd.concat([left, right]),
                                        pandas.concat([left, right]))
    assert frames_match
|
||||
|
||||
|
||||
def test_ray_concat():
    """Concatenating two Ray frames must match ``pandas.concat`` on the originals."""
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    actual = pd.concat([ray_left, ray_right])
    expected = pandas.concat([left, right])
    assert ray_df_equals_pandas(actual, expected)
|
||||
|
||||
|
||||
def test_ray_concat_on_index():
    """Row-wise concat must match pandas for every spelling of the index axis."""
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    # 'index', 'rows' and 0 are checked in turn, each against pandas.
    for row_axis in ('index', 'rows', 0):
        actual = pd.concat([ray_left, ray_right], axis=row_axis)
        expected = pandas.concat([left, right], axis=row_axis)
        assert ray_df_equals_pandas(actual, expected)
|
||||
|
||||
|
||||
def test_ray_concat_on_column():
    """Column-wise concat must match pandas for both spellings of the axis."""
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    # 1 and "columns" are checked in turn, each against pandas.
    for column_axis in (1, "columns"):
        actual = pd.concat([ray_left, ray_right], axis=column_axis)
        expected = pandas.concat([left, right], axis=column_axis)
        assert ray_df_equals_pandas(actual, expected)
|
||||
|
||||
|
||||
def test_invalid_axis_errors():
    """``pd.concat`` with ``axis=2`` must raise ``ValueError``."""
    left, right = generate_dfs()
    ray_frames = [from_pandas(left, 2), from_pandas(right, 2)]

    with pytest.raises(ValueError):
        pd.concat(ray_frames, axis=2)
|
||||
|
||||
|
||||
def test_mixed_concat():
    """Concat of two Ray frames plus one pandas frame must match pandas."""
    left, right = generate_dfs()
    plain_copy = left.copy()

    mixed_dfs = [from_pandas(left, 2), from_pandas(right, 2), plain_copy]

    actual = pd.concat(mixed_dfs)
    expected = pandas.concat([left, right, plain_copy])
    assert ray_df_equals_pandas(actual, expected)
|
||||
|
||||
|
||||
def test_mixed_inner_concat():
    """Inner-join concat of mixed Ray/pandas frames must match pandas."""
    left, right = generate_dfs()
    plain_copy = left.copy()

    mixed_dfs = [from_pandas(left, 2), from_pandas(right, 2), plain_copy]

    actual = pd.concat(mixed_dfs, join='inner')
    expected = pandas.concat([left, right, plain_copy], join='inner')
    assert ray_df_equals_pandas(actual, expected)
|
||||
|
||||
|
||||
def test_mixed_none_concat():
    """Concatenate mixed Ray/pandas frames built by ``generate_none_dfs``."""
    first, second = generate_none_dfs()
    third = first.copy()

    mixed_dfs = [from_pandas(first, 2), from_pandas(second, 2), third]

    ray_result = pd.concat(mixed_dfs)
    expected = pandas.concat([first, second, third])
    assert ray_df_equals_pandas(ray_result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,589 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
import pandas
|
||||
import numpy as np
|
||||
import ray.dataframe as pd
|
||||
from ray.dataframe.utils import (
|
||||
from_pandas,
|
||||
to_pandas)
|
||||
|
||||
# True only when running under a Python 2 interpreter.
PY2 = sys.version_info.major < 3
|
||||
|
||||
|
||||
@pytest.fixture
def ray_df_equals_pandas(ray_df, pandas_df):
    """Assert that ``ray_df`` is a Ray DataFrame equal to ``pandas_df``.

    Both checks are assertions; nothing is returned.
    """
    assert isinstance(ray_df, pd.DataFrame)
    converted = to_pandas(ray_df)
    assert converted.equals(pandas_df)
|
||||
|
||||
|
||||
@pytest.fixture
def ray_df_almost_equals_pandas(ray_df, pandas_df):
    """Assert that ``ray_df`` matches ``pandas_df`` exactly or within 1e-4.

    The Ray frame is converted to pandas once.  If the frames are not
    exactly equal, the largest *absolute* element-wise difference must be
    below 0.0001.  The absolute value matters: with a signed maximum, a
    Ray value far below the pandas value would give a negative difference
    and slip under the tolerance.
    """
    assert isinstance(ray_df, pd.DataFrame)
    converted = to_pandas(ray_df)
    if converted.equals(pandas_df):
        return

    # First max() reduces per column, second max() reduces across columns.
    diff_max = (converted - pandas_df).abs().max().max()
    assert diff_max < 0.0001
|
||||
|
||||
|
||||
@pytest.fixture
def ray_series_equals_pandas(ray_df, pandas_df):
    """Assert that ``ray_df.equals(pandas_df)`` holds."""
    matches = ray_df.equals(pandas_df)
    assert matches
|
||||
|
||||
|
||||
@pytest.fixture
def ray_df_equals(ray_df1, ray_df2):
    """Assert that two Ray frames are equal after conversion to pandas."""
    left = to_pandas(ray_df1)
    right = to_pandas(ray_df2)
    assert left.equals(right)
|
||||
|
||||
|
||||
@pytest.fixture
def ray_groupby_equals_pandas(ray_groupby, pandas_groupby):
    """Walk both groupbys in lockstep and compare each key and group frame.

    Iteration stops at the shorter of the two groupbys (``zip`` semantics).
    """
    for ray_item, pandas_item in zip(ray_groupby, pandas_groupby):
        # Element 0 is the group key, element 1 the group's frame.
        ray_key, pandas_key = ray_item[0], pandas_item[0]
        assert ray_key == pandas_key
        ray_df_equals_pandas(ray_item[1], pandas_item[1])
|
||||
|
||||
|
||||
def test_simple_row_groupby():
    """Run the groupby checks on a 4-row frame split by row into two groups."""
    pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                                  'col2': [4, 5, 6, 7],
                                  'col3': [3, 8, 12, 10],
                                  'col4': [17, 13, 16, 15],
                                  'col5': [-4, -5, -6, -7]})
    ray_df = from_pandas(pandas_df, 2)

    by = [1, 2, 1, 2]
    n = 1

    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)

    # The checks below run in exactly the order the tuples list them.
    for check in (test_ngroups, test_skew, test_ffill, test_sem, test_mean,
                  test_any, test_min, test_idxmax, test_ndim, test_cumsum,
                  test_pct_change, test_cummax):
        check(ray_groupby, pandas_groupby)

    for func in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_groupby, pandas_groupby, func)

    for check in (test_dtypes, test_first, test_backfill, test_cummin,
                  test_bfill, test_idxmin, test_prod, test_std):
        check(ray_groupby, pandas_groupby)

    for func in ('min', 'max'):
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)

    for check in (test_last, test_mad, test_rank, test_max, test_var,
                  test_len, test_sum, test_ngroup, test_nunique,
                  test_median):
        check(ray_groupby, pandas_groupby)

    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, func)

    for check in (test_corr, test_fillna, test_count):
        check(ray_groupby, pandas_groupby)

    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
|
||||
|
||||
|
||||
def test_single_group_row_groupby():
    """Run the groupby checks when every row falls into the same group."""
    pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                                  'col2': [4, 5, 36, 7],
                                  'col3': [3, 8, 12, 10],
                                  'col4': [17, 3, 16, 15],
                                  'col5': [-4, 5, -6, -7]})
    ray_df = from_pandas(pandas_df, 2)

    by = [1, 1, 1, 1]
    n = 6  # larger than the frame's four rows

    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)

    # The checks below run in exactly the order the tuples list them.
    for check in (test_ngroups, test_skew, test_ffill, test_sem, test_mean,
                  test_any, test_min, test_idxmax, test_ndim, test_cumsum,
                  test_pct_change, test_cummax):
        check(ray_groupby, pandas_groupby)

    for func in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_groupby, pandas_groupby, func)

    for check in (test_dtypes, test_first, test_backfill, test_cummin,
                  test_bfill, test_idxmin, test_prod, test_std):
        check(ray_groupby, pandas_groupby)

    for func in ('min', 'max'):
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)

    for check in (test_last, test_mad, test_rank, test_max, test_var,
                  test_len, test_sum, test_ngroup, test_nunique,
                  test_median):
        check(ray_groupby, pandas_groupby)

    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, func)

    for check in (test_corr, test_fillna, test_count):
        check(ray_groupby, pandas_groupby)

    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
|
||||
|
||||
|
||||
def test_large_row_groupby():
    """Run the groupby checks on a random 100x4 integer frame keyed by 'A'."""
    pandas_df = pandas.DataFrame(np.random.randint(0, 8, size=(100, 4)),
                                 columns=list('ABCD'))
    ray_df = from_pandas(pandas_df, 2)

    by = pandas_df['A'].tolist()
    n = 4

    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)

    # The checks below run in exactly the order the tuples list them.
    for check in (test_ngroups, test_skew, test_ffill, test_sem, test_mean,
                  test_any, test_min, test_idxmax, test_ndim, test_cumsum,
                  test_pct_change, test_cummax):
        check(ray_groupby, pandas_groupby)

    for func in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_groupby, pandas_groupby, func)

    # test_prod is left out here: it causes overflows.
    for check in (test_dtypes, test_first, test_backfill, test_cummin,
                  test_bfill, test_idxmin, test_std):
        check(ray_groupby, pandas_groupby)

    for func in ('min', 'max'):
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)

    for check in (test_last, test_mad, test_rank, test_max, test_var,
                  test_len, test_sum, test_ngroup, test_nunique,
                  test_median):
        check(ray_groupby, pandas_groupby)

    test_head(ray_groupby, pandas_groupby, n)
    # test_cumprod is left out here: it causes overflows.
    test_cov(ray_groupby, pandas_groupby)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, func)

    for check in (test_corr, test_fillna, test_count):
        check(ray_groupby, pandas_groupby)

    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
|
||||
|
||||
|
||||
def test_simple_col_groupby():
    """Run the groupby checks on a small frame grouped along ``axis=1``."""
    pandas_df = pandas.DataFrame({'col1': [0, 3, 2, 3],
                                  'col2': [4, 1, 6, 7],
                                  'col3': [3, 8, 2, 10],
                                  'col4': [1, 13, 6, 15],
                                  'col5': [-4, 5, 6, -7]})
    ray_df = from_pandas(pandas_df, 2)

    by = [1, 2, 3, 2, 1]  # one key per column

    ray_groupby = ray_df.groupby(axis=1, by=by)
    pandas_groupby = pandas_df.groupby(axis=1, by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)

    for check in (test_ngroups, test_skew, test_ffill, test_sem, test_mean,
                  test_any, test_min, test_ndim):
        check(ray_groupby, pandas_groupby)

    if not PY2:
        # idxmax and idxmin fail on column groupby in pandas with python2
        # NOTE(review): the extent of this branch was inferred from the
        # blank line after test_quantile -- confirm against the original
        # indentation.
        for check in (test_idxmax, test_idxmin, test_rank, test_quantile):
            check(ray_groupby, pandas_groupby)

    # The cumsum, cummax, cummin and cumprod checks are disabled here; see
    # https://github.com/pandas-dev/pandas/issues/21127

    test_pct_change(ray_groupby, pandas_groupby)
    for func in (lambda df: -df, lambda df: df.sum(axis=1)):
        test_apply(ray_groupby, pandas_groupby, func)

    for check in (test_first, test_backfill, test_bfill, test_prod,
                  test_std, test_last, test_mad, test_max, test_var,
                  test_len, test_sum):
        check(ray_groupby, pandas_groupby)

    # Pandas fails on this case with ValueError, so the ngroup and nunique
    # checks are disabled.
    test_median(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, func)

    for check in (test_corr, test_fillna, test_count, test_take):
        check(ray_groupby, pandas_groupby)
|
||||
|
||||
|
||||
@pytest.fixture
def test_ngroups(ray_groupby, pandas_groupby):
    """Both groupbys must report the same ``ngroups``."""
    expected = pandas_groupby.ngroups
    assert ray_groupby.ngroups == expected
|
||||
|
||||
|
||||
@pytest.fixture
def test_skew(ray_groupby, pandas_groupby):
    """Compare ``skew()`` results using the approximate comparison."""
    ray_result = ray_groupby.skew()
    pandas_result = pandas_groupby.skew()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_ffill(ray_groupby, pandas_groupby):
    """Compare ``ffill()`` results for exact equality."""
    ray_result = ray_groupby.ffill()
    pandas_result = pandas_groupby.ffill()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_sem(ray_groupby, pandas_groupby):
    """``sem()`` on the Ray groupby is expected to raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.sem()
|
||||
|
||||
|
||||
@pytest.fixture
def test_mean(ray_groupby, pandas_groupby):
    """Compare ``mean()`` results using the approximate comparison."""
    ray_result = ray_groupby.mean()
    pandas_result = pandas_groupby.mean()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_any(ray_groupby, pandas_groupby):
    """Compare ``any()`` results for exact equality."""
    ray_result = ray_groupby.any()
    pandas_result = pandas_groupby.any()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_min(ray_groupby, pandas_groupby):
    """Compare ``min()`` results for exact equality."""
    ray_result = ray_groupby.min()
    pandas_result = pandas_groupby.min()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_idxmax(ray_groupby, pandas_groupby):
    """Compare ``idxmax()`` results for exact equality."""
    ray_result = ray_groupby.idxmax()
    pandas_result = pandas_groupby.idxmax()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_ndim(ray_groupby, pandas_groupby):
    """Both groupbys must report the same ``ndim``."""
    expected = pandas_groupby.ndim
    assert ray_groupby.ndim == expected
|
||||
|
||||
|
||||
@pytest.fixture
def test_cumsum(ray_groupby, pandas_groupby):
    """Compare ``cumsum`` with default arguments, then with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cumsum(**kwargs)
        pandas_result = pandas_groupby.cumsum(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_pct_change(ray_groupby, pandas_groupby):
    """``pct_change()`` on the Ray groupby should raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.pct_change()
|
||||
|
||||
|
||||
@pytest.fixture
def test_cummax(ray_groupby, pandas_groupby):
    """Compare ``cummax`` with default arguments, then with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cummax(**kwargs)
        pandas_result = pandas_groupby.cummax(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_apply(ray_groupby, pandas_groupby, func):
    """Compare ``apply(func)`` results for exact equality."""
    ray_result = ray_groupby.apply(func)
    pandas_result = pandas_groupby.apply(func)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_dtypes(ray_groupby, pandas_groupby):
    """Compare the ``dtypes`` attribute of both groupbys."""
    ray_result = ray_groupby.dtypes
    pandas_result = pandas_groupby.dtypes
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_first(ray_groupby, pandas_groupby):
    """``first()`` on the Ray groupby should raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.first()
|
||||
|
||||
|
||||
@pytest.fixture
def test_backfill(ray_groupby, pandas_groupby):
    """Compare ``backfill()`` results for exact equality."""
    ray_result = ray_groupby.backfill()
    pandas_result = pandas_groupby.backfill()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_cummin(ray_groupby, pandas_groupby):
    """Compare ``cummin`` with default arguments, then with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cummin(**kwargs)
        pandas_result = pandas_groupby.cummin(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_bfill(ray_groupby, pandas_groupby):
    """Compare ``bfill()`` results for exact equality."""
    ray_result = ray_groupby.bfill()
    pandas_result = pandas_groupby.bfill()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_idxmin(ray_groupby, pandas_groupby):
    """Compare ``idxmin()`` results for exact equality."""
    ray_result = ray_groupby.idxmin()
    pandas_result = pandas_groupby.idxmin()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_prod(ray_groupby, pandas_groupby):
    """Compare ``prod()`` results for exact equality."""
    ray_result = ray_groupby.prod()
    pandas_result = pandas_groupby.prod()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_std(ray_groupby, pandas_groupby):
    """Compare ``std()`` results using the approximate comparison."""
    ray_result = ray_groupby.std()
    pandas_result = pandas_groupby.std()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_aggregate(ray_groupby, pandas_groupby, func):
    """Compare ``aggregate(func)`` results for exact equality."""
    ray_result = ray_groupby.aggregate(func)
    pandas_result = pandas_groupby.aggregate(func)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_agg(ray_groupby, pandas_groupby, func):
    """Compare ``agg(func)`` results for exact equality."""
    ray_result = ray_groupby.agg(func)
    pandas_result = pandas_groupby.agg(func)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_last(ray_groupby, pandas_groupby):
    """``last()`` on the Ray groupby should raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.last()
|
||||
|
||||
|
||||
@pytest.fixture
def test_mad(ray_groupby, pandas_groupby):
    """``mad()`` on the Ray groupby should raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.mad()
|
||||
|
||||
|
||||
@pytest.fixture
def test_rank(ray_groupby, pandas_groupby):
    """Compare ``rank()`` results for exact equality."""
    ray_result = ray_groupby.rank()
    pandas_result = pandas_groupby.rank()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_max(ray_groupby, pandas_groupby):
    """Compare ``max()`` results for exact equality."""
    ray_result = ray_groupby.max()
    pandas_result = pandas_groupby.max()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_var(ray_groupby, pandas_groupby):
    """Compare ``var()`` results using the approximate comparison."""
    ray_result = ray_groupby.var()
    pandas_result = pandas_groupby.var()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_len(ray_groupby, pandas_groupby):
    """Both groupbys must have the same ``len()``."""
    ray_length = len(ray_groupby)
    pandas_length = len(pandas_groupby)
    assert ray_length == pandas_length
|
||||
|
||||
|
||||
@pytest.fixture
def test_sum(ray_groupby, pandas_groupby):
    """Compare ``sum()`` results for exact equality."""
    ray_result = ray_groupby.sum()
    pandas_result = pandas_groupby.sum()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_ngroup(ray_groupby, pandas_groupby):
    """Compare ``ngroup()`` results with the series comparison helper."""
    ray_result = ray_groupby.ngroup()
    pandas_result = pandas_groupby.ngroup()
    ray_series_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_nunique(ray_groupby, pandas_groupby):
    """Compare ``nunique()`` results for exact equality."""
    ray_result = ray_groupby.nunique()
    pandas_result = pandas_groupby.nunique()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_median(ray_groupby, pandas_groupby):
    """Compare ``median()`` results for exact equality."""
    ray_result = ray_groupby.median()
    pandas_result = pandas_groupby.median()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_head(ray_groupby, pandas_groupby, n):
    """Compare ``head(n=n)`` results for exact equality."""
    ray_result = ray_groupby.head(n=n)
    pandas_result = pandas_groupby.head(n=n)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_cumprod(ray_groupby, pandas_groupby):
    """Compare ``cumprod`` with default arguments, then with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cumprod(**kwargs)
        pandas_result = pandas_groupby.cumprod(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_cov(ray_groupby, pandas_groupby):
    """``cov()`` on the Ray groupby should raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.cov()
|
||||
|
||||
|
||||
@pytest.fixture
def test_transform(ray_groupby, pandas_groupby, func):
    """Compare ``transform(func)`` results for exact equality."""
    ray_result = ray_groupby.transform(func)
    pandas_result = pandas_groupby.transform(func)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_corr(ray_groupby, pandas_groupby):
    """``corr()`` on the Ray groupby should raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.corr()
|
||||
|
||||
|
||||
@pytest.fixture
def test_fillna(ray_groupby, pandas_groupby):
    """Compare ``fillna(method="ffill")`` results for exact equality."""
    ray_result = ray_groupby.fillna(method="ffill")
    pandas_result = pandas_groupby.fillna(method="ffill")
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_count(ray_groupby, pandas_groupby):
    """Compare ``count()`` results for exact equality."""
    ray_result = ray_groupby.count()
    pandas_result = pandas_groupby.count()
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_pipe(ray_groupby, pandas_groupby, func):
    """Compare ``pipe(func)`` results for exact equality."""
    ray_result = ray_groupby.pipe(func)
    pandas_result = pandas_groupby.pipe(func)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_tail(ray_groupby, pandas_groupby, n):
    """Compare ``tail(n=n)`` results for exact equality."""
    ray_result = ray_groupby.tail(n=n)
    pandas_result = pandas_groupby.tail(n=n)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_quantile(ray_groupby, pandas_groupby):
    """Compare ``quantile(q=0.4)`` results for exact equality."""
    ray_result = ray_groupby.quantile(q=0.4)
    pandas_result = pandas_groupby.quantile(q=0.4)
    ray_df_equals_pandas(ray_result, pandas_result)
|
||||
|
||||
|
||||
@pytest.fixture
def test_take(ray_groupby, pandas_groupby):
    """``take(indices=[1])`` on the Ray groupby should raise NotImplementedError.

    ``pandas_groupby`` is accepted but not used.
    """
    with pytest.raises(NotImplementedError):
        ray_groupby.take(indices=[1])
|
||||
@@ -1,675 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas
|
||||
from ray.dataframe.utils import to_pandas
|
||||
import ray.dataframe as pd
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
# Row counts used when generating fixture data.
SMALL_ROW_SIZE = 2000
# NOTE(review): this is a float literal, so np.arange(LARGE_ROW_SIZE) yields
# float64 columns -- confirm that is intended.
LARGE_ROW_SIZE = 7e6

# Scratch files written and removed by the reader tests.  These are relative
# names, so they resolve against the current working directory.
TEST_PARQUET_FILENAME = 'test.parquet'
TEST_CSV_FILENAME = 'test.csv'
TEST_JSON_FILENAME = 'test.json'
TEST_HTML_FILENAME = 'test.html'
TEST_EXCEL_FILENAME = 'test.xlsx'
TEST_FEATHER_FILENAME = 'test.feather'
TEST_HDF_FILENAME = 'test.hdf'
TEST_MSGPACK_FILENAME = 'test.msg'
TEST_STATA_FILENAME = 'test.dta'
TEST_PICKLE_FILENAME = 'test.pkl'
TEST_SQL_FILENAME = 'test.db'

# The SAS path is built from the working directory at import time.
TEST_SAS_FILENAME = os.getcwd() + '/data/test1.sas7bdat'
|
||||
|
||||
|
||||
@pytest.fixture
def ray_df_equals_pandas(ray_df, pandas_df):
    """Return whether ``ray_df`` equals ``pandas_df`` after index-sorting both."""
    ray_sorted = to_pandas(ray_df).sort_index()
    pandas_sorted = pandas_df.sort_index()
    return ray_sorted.equals(pandas_sorted)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_parquet_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as parquet.

    An existing ``TEST_PARQUET_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_PARQUET_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_parquet(TEST_PARQUET_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def create_test_ray_dataframe():
    """Build the 4-row, 5-column sample frame as a ``pd.DataFrame``."""
    columns = {'col1': [0, 1, 2, 3],
               'col2': [4, 5, 6, 7],
               'col3': [8, 9, 10, 11],
               'col4': [12, 13, 14, 15],
               'col5': [0, 0, 0, 0]}
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.fixture
def create_test_pandas_dataframe():
    """Build the 4-row, 5-column sample frame as a ``pandas.DataFrame``."""
    columns = {'col1': [0, 1, 2, 3],
               'col2': [4, 5, 6, 7],
               'col3': [8, 9, 10, 11],
               'col4': [12, 13, 14, 15],
               'col5': [0, 0, 0, 0]}
    return pandas.DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.fixture
def test_files_eq(path1, path2):
    """Return True when the two files hold identical bytes, else False."""
    with open(path1, 'rb') as first, open(path2, 'rb') as second:
        first_bytes = first.read()
        second_bytes = second.read()

    return first_bytes == second_bytes
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_test_file(test_path):
    """Remove ``test_path`` if it exists; do nothing otherwise."""
    if not os.path.exists(test_path):
        return
    os.remove(test_path)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_parquet_file():
    """Remove ``TEST_PARQUET_FILENAME`` if it exists."""
    if not os.path.exists(TEST_PARQUET_FILENAME):
        return
    os.remove(TEST_PARQUET_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_csv_file(row_size, force=False, delimiter=','):
    """Write a two-column frame of ``np.arange(row_size)`` as CSV.

    ``delimiter`` is passed to ``to_csv`` as ``sep``.  An existing
    ``TEST_CSV_FILENAME`` is left untouched unless ``force`` is true.
    """
    if os.path.exists(TEST_CSV_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_csv(TEST_CSV_FILENAME, sep=delimiter)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_csv_file():
    """Remove ``TEST_CSV_FILENAME`` if it exists."""
    if not os.path.exists(TEST_CSV_FILENAME):
        return
    os.remove(TEST_CSV_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_json_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as JSON.

    An existing ``TEST_JSON_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_JSON_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_json(TEST_JSON_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_json_file():
    """Remove ``TEST_JSON_FILENAME`` if it exists."""
    if not os.path.exists(TEST_JSON_FILENAME):
        return
    os.remove(TEST_JSON_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_html_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as HTML.

    An existing ``TEST_HTML_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_HTML_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_html(TEST_HTML_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_html_file():
    """Remove ``TEST_HTML_FILENAME`` if it exists."""
    if not os.path.exists(TEST_HTML_FILENAME):
        return
    os.remove(TEST_HTML_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_clipboard(row_size, force=False):
    """Copy a two-column frame of ``np.arange(row_size)`` to the clipboard.

    ``force`` is accepted but not used.
    """
    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_clipboard()
|
||||
|
||||
|
||||
@pytest.fixture
def setup_excel_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as an Excel file.

    An existing ``TEST_EXCEL_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_EXCEL_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_excel(TEST_EXCEL_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_excel_file():
    """Remove ``TEST_EXCEL_FILENAME`` if it exists."""
    if not os.path.exists(TEST_EXCEL_FILENAME):
        return
    os.remove(TEST_EXCEL_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_feather_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as feather.

    An existing ``TEST_FEATHER_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_FEATHER_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_feather(TEST_FEATHER_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_feather_file():
    """Remove ``TEST_FEATHER_FILENAME`` if it exists."""
    if not os.path.exists(TEST_FEATHER_FILENAME):
        return
    os.remove(TEST_FEATHER_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_hdf_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as HDF, key 'test'.

    An existing ``TEST_HDF_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_HDF_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_hdf(TEST_HDF_FILENAME, 'test')
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_hdf_file():
    """Remove ``TEST_HDF_FILENAME`` if it exists."""
    if not os.path.exists(TEST_HDF_FILENAME):
        return
    os.remove(TEST_HDF_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_msgpack_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as msgpack.

    An existing ``TEST_MSGPACK_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_MSGPACK_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_msgpack(TEST_MSGPACK_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_msgpack_file():
    """Remove ``TEST_MSGPACK_FILENAME`` if it exists."""
    if not os.path.exists(TEST_MSGPACK_FILENAME):
        return
    os.remove(TEST_MSGPACK_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_stata_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as a Stata file.

    An existing ``TEST_STATA_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_STATA_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_stata(TEST_STATA_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_stata_file():
    """Remove ``TEST_STATA_FILENAME`` if it exists."""
    if not os.path.exists(TEST_STATA_FILENAME):
        return
    os.remove(TEST_STATA_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_pickle_file(row_size, force=False):
    """Write a two-column frame of ``np.arange(row_size)`` as a pickle.

    An existing ``TEST_PICKLE_FILENAME`` is left untouched unless ``force``
    is true.
    """
    if os.path.exists(TEST_PICKLE_FILENAME) and not force:
        return

    columns = {'col1': np.arange(row_size), 'col2': np.arange(row_size)}
    pandas.DataFrame(columns).to_pickle(TEST_PICKLE_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_pickle_file():
    """Remove ``TEST_PICKLE_FILENAME`` if it exists."""
    if not os.path.exists(TEST_PICKLE_FILENAME):
        return
    os.remove(TEST_PICKLE_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture
def setup_sql_file(conn, force=False):
    """Write the 4-row sample frame through ``conn`` as a SQL table.

    The table name is ``TEST_SQL_FILENAME`` up to its first dot.  Nothing
    is written when the database file already exists, unless ``force`` is
    true.
    """
    if os.path.exists(TEST_SQL_FILENAME) and not force:
        return

    columns = {'col1': [0, 1, 2, 3],
               'col2': [4, 5, 6, 7],
               'col3': [8, 9, 10, 11],
               'col4': [12, 13, 14, 15],
               'col5': [0, 0, 0, 0]}
    table_name = TEST_SQL_FILENAME.split(".")[0]
    pandas.DataFrame(columns).to_sql(table_name, conn)
|
||||
|
||||
|
||||
@pytest.fixture
def teardown_sql_file():
    """Remove ``TEST_SQL_FILENAME`` if it exists."""
    if not os.path.exists(TEST_SQL_FILENAME):
        return
    os.remove(TEST_SQL_FILENAME)
|
||||
|
||||
|
||||
def test_from_parquet_small():
    """``pd.read_parquet`` must match pandas on a ``SMALL_ROW_SIZE`` file.

    The scratch file is removed in a ``finally`` clause.  Without it, a
    failed read or assertion leaves the file behind, and
    ``setup_parquet_file`` reuses an existing file unless forced.
    """
    try:
        setup_parquet_file(SMALL_ROW_SIZE)

        pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
        ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_parquet_file()
|
||||
|
||||
|
||||
def test_from_parquet_large():
    """``pd.read_parquet`` must match pandas on a ``LARGE_ROW_SIZE`` file.

    The scratch file is removed in a ``finally`` clause.  Without it, a
    failed read or assertion leaves the file behind, and
    ``setup_parquet_file`` reuses an existing file unless forced.
    """
    try:
        setup_parquet_file(LARGE_ROW_SIZE)

        pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
        ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_parquet_file()
|
||||
|
||||
|
||||
def test_from_csv():
    """``pd.read_csv`` must match ``pandas.read_csv`` on the generated file.

    The scratch file is removed in a ``finally`` clause so that a failure
    cannot leave a stale CSV for later tests (``setup_csv_file`` reuses an
    existing file unless forced).
    """
    try:
        setup_csv_file(SMALL_ROW_SIZE)

        pandas_df = pandas.read_csv(TEST_CSV_FILENAME)
        ray_df = pd.read_csv(TEST_CSV_FILENAME)

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_csv_file()
|
||||
|
||||
|
||||
def test_from_json():
    """``pd.read_json`` must match ``pandas.read_json`` on the generated file.

    The scratch file is removed in a ``finally`` clause so a failure cannot
    leave it behind.
    """
    try:
        setup_json_file(SMALL_ROW_SIZE)

        pandas_df = pandas.read_json(TEST_JSON_FILENAME)
        ray_df = pd.read_json(TEST_JSON_FILENAME)

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_json_file()
|
||||
|
||||
|
||||
def test_from_html():
    """``pd.read_html`` must match the first table pandas reads from the file.

    ``pandas.read_html`` returns a list and its first element is used; the
    ``pd.read_html`` result is compared as returned.  The scratch file is
    removed in a ``finally`` clause so a failure cannot leave it behind.
    """
    try:
        setup_html_file(SMALL_ROW_SIZE)

        pandas_df = pandas.read_html(TEST_HTML_FILENAME)[0]
        ray_df = pd.read_html(TEST_HTML_FILENAME)

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_html_file()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="No clipboard on Travis")
def test_from_clipboard():
    """``pd.read_clipboard`` must match ``pandas.read_clipboard`` (skipped)."""
    setup_clipboard(SMALL_ROW_SIZE)

    expected = pandas.read_clipboard()
    actual = pd.read_clipboard()

    assert ray_df_equals_pandas(actual, expected)
|
||||
|
||||
|
||||
def test_from_excel():
    """``pd.read_excel`` must match ``pandas.read_excel`` on the generated file.

    The scratch file is removed in a ``finally`` clause so a failure cannot
    leave it behind.
    """
    try:
        setup_excel_file(SMALL_ROW_SIZE)

        pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME)
        ray_df = pd.read_excel(TEST_EXCEL_FILENAME)

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_excel_file()
|
||||
|
||||
|
||||
def test_from_feather():
    """``pd.read_feather`` must match pandas on the generated file.

    The scratch file is removed in a ``finally`` clause so a failure cannot
    leave it behind.
    """
    try:
        setup_feather_file(SMALL_ROW_SIZE)

        pandas_df = pandas.read_feather(TEST_FEATHER_FILENAME)
        ray_df = pd.read_feather(TEST_FEATHER_FILENAME)

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_feather_file()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Memory overflow on Travis")
def test_from_hdf():
    """``pd.read_hdf`` must match pandas for key 'test' (currently skipped).

    The scratch file is removed in a ``finally`` clause so a failure cannot
    leave it behind.
    """
    try:
        setup_hdf_file(SMALL_ROW_SIZE)

        pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key='test')
        ray_df = pd.read_hdf(TEST_HDF_FILENAME, key='test')

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        teardown_hdf_file()
|
||||
|
||||
|
||||
def test_from_msgpack():
    """Compare pd.read_msgpack with pandas.read_msgpack on the fixture."""
    setup_msgpack_file(SMALL_ROW_SIZE)
    try:
        expected = pandas.read_msgpack(TEST_MSGPACK_FILENAME)
        actual = pd.read_msgpack(TEST_MSGPACK_FILENAME)

        assert ray_df_equals_pandas(actual, expected)
    finally:
        # Run the teardown even when the read or the comparison fails.
        teardown_msgpack_file()
|
||||
|
||||
|
||||
def test_from_stata():
    """Compare pd.read_stata with pandas.read_stata on the Stata fixture."""
    setup_stata_file(SMALL_ROW_SIZE)
    try:
        expected = pandas.read_stata(TEST_STATA_FILENAME)
        actual = pd.read_stata(TEST_STATA_FILENAME)

        assert ray_df_equals_pandas(actual, expected)
    finally:
        # Run the teardown even when the read or the comparison fails.
        teardown_stata_file()
|
||||
|
||||
|
||||
def test_from_pickle():
    """Compare pd.read_pickle with pandas.read_pickle on the fixture."""
    setup_pickle_file(SMALL_ROW_SIZE)
    try:
        expected = pandas.read_pickle(TEST_PICKLE_FILENAME)
        actual = pd.read_pickle(TEST_PICKLE_FILENAME)

        assert ray_df_equals_pandas(actual, expected)
    finally:
        # Run the teardown even when the read or the comparison fails.
        teardown_pickle_file()
|
||||
|
||||
|
||||
def test_from_sql():
    """Compare pd.read_sql with pandas.read_sql on a sqlite fixture."""
    conn = sqlite3.connect(TEST_SQL_FILENAME)
    try:
        setup_sql_file(conn, True)

        query = "select * from test"
        expected = pandas.read_sql(query, conn)
        actual = pd.read_sql(query, conn)

        assert ray_df_equals_pandas(actual, expected)
    finally:
        # Close the connection before the teardown, and do both even when
        # the test body fails.
        conn.close()
        teardown_sql_file()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="No SAS write methods in Pandas")
def test_from_sas():
    """Compare pd.read_sas with pandas.read_sas on the SAS file."""
    expected = pandas.read_sas(TEST_SAS_FILENAME)
    actual = pd.read_sas(TEST_SAS_FILENAME)

    assert ray_df_equals_pandas(actual, expected)
|
||||
|
||||
|
||||
def test_from_csv_delimiter():
    """Compare pd.read_csv with pandas.read_csv on a '|'-delimited file."""
    setup_csv_file(SMALL_ROW_SIZE, delimiter='|')
    try:
        # NOTE(review): neither reader is told about the '|' delimiter, so
        # both parse with their defaults -- confirm this is intended, or pass
        # the delimiter to both calls.
        expected = pandas.read_csv(TEST_CSV_FILENAME)
        actual = pd.read_csv(TEST_CSV_FILENAME)

        assert ray_df_equals_pandas(actual, expected)
    finally:
        # Run the teardown even when the read or the comparison fails.
        teardown_csv_file()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="No clipboard on Travis")
def test_to_clipboard():
    """Round-trip both frames through the clipboard and compare them."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    # Each frame is written and read back immediately, because the second
    # write replaces the clipboard contents.
    ray_df.to_clipboard()
    from_ray = pandas.read_clipboard()

    pandas_df.to_clipboard()
    from_pandas_df = pandas.read_clipboard()

    assert from_ray.equals(from_pandas_df)
|
||||
|
||||
|
||||
def test_to_csv():
    """Check that to_csv output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.csv"
    pandas_path = "test_pandas.csv"

    try:
        ray_df.to_csv(ray_path)
        pandas_df.to_csv(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
|
||||
|
||||
def test_to_dense():
    """to_dense is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_dense()
|
||||
|
||||
|
||||
def test_to_dict():
    """to_dict is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_dict()
|
||||
|
||||
|
||||
def test_to_excel():
    """Check that to_excel output matches the workbook written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.xlsx"
    pandas_path = "test_pandas.xlsx"

    try:
        ray_writer = pandas.ExcelWriter(ray_path)
        pandas_writer = pandas.ExcelWriter(pandas_path)

        ray_df.to_excel(ray_writer)
        pandas_df.to_excel(pandas_writer)

        ray_writer.save()
        pandas_writer.save()

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(ray_path)
        teardown_test_file(pandas_path)
|
||||
|
||||
|
||||
def test_to_feather():
    """Check that to_feather output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.feather"
    pandas_path = "test_pandas.feather"

    try:
        ray_df.to_feather(ray_path)
        pandas_df.to_feather(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
|
||||
|
||||
def test_to_gbq():
    """to_gbq is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    destination = "test_df.gbq"
    with pytest.raises(NotImplementedError):
        frame.to_gbq(destination, None)
|
||||
|
||||
|
||||
def test_to_html():
    """Check that to_html output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.html"
    pandas_path = "test_pandas.html"

    try:
        ray_df.to_html(ray_path)
        pandas_df.to_html(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
|
||||
|
||||
def test_to_json():
    """Check that to_json output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.json"
    pandas_path = "test_pandas.json"

    try:
        ray_df.to_json(ray_path)
        pandas_df.to_json(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
|
||||
|
||||
def test_to_latex():
    """to_latex is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_latex()
|
||||
|
||||
|
||||
def test_to_msgpack():
    """Check that to_msgpack output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.msgpack"
    pandas_path = "test_pandas.msgpack"

    try:
        ray_df.to_msgpack(ray_path)
        pandas_df.to_msgpack(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
|
||||
|
||||
def test_to_panel():
    """to_panel is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_panel()
|
||||
|
||||
|
||||
def test_to_parquet():
    """Check that to_parquet output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.parquet"
    pandas_path = "test_pandas.parquet"

    try:
        ray_df.to_parquet(ray_path)
        pandas_df.to_parquet(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
|
||||
|
||||
def test_to_period():
    """to_period is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_period()
|
||||
|
||||
|
||||
def test_to_pickle():
    """Check that to_pickle output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.pkl"
    pandas_path = "test_pandas.pkl"

    try:
        ray_df.to_pickle(ray_path)
        pandas_df.to_pickle(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
|
||||
|
||||
def test_to_sql():
    """Write both frames to ".sql"-named files and compare the files.

    NOTE(review): despite the test name, the body calls to_pickle, never
    to_sql, so to_sql itself is not exercised here -- confirm whether this
    is a copy-paste slip from test_to_pickle.
    """
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.sql"
    pandas_path = "test_pandas.sql"

    try:
        ray_df.to_pickle(ray_path)
        pandas_df.to_pickle(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(ray_path)
        teardown_test_file(pandas_path)
|
||||
|
||||
|
||||
def test_to_stata():
    """Check that to_stata output matches the file written by pandas."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.stata"
    pandas_path = "test_pandas.stata"

    try:
        ray_df.to_stata(ray_path)
        pandas_df.to_stata(pandas_path)

        assert test_files_eq(ray_path, pandas_path)
    finally:
        # Run both teardowns even when a write or the comparison fails.
        teardown_test_file(pandas_path)
        teardown_test_file(ray_path)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,577 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import pandas
|
||||
import numpy as np
|
||||
import ray
|
||||
|
||||
from . import get_npartitions
|
||||
|
||||
|
||||
_NAN_BLOCKS = {}
|
||||
_MEMOIZER_CAPACITY = 1000 # Capacity per function
|
||||
|
||||
|
||||
class LRUCache(object):
    """A least-recently-used cache built on collections.OrderedDict.

    Notes:
        - The OrderedDict records the order in which keys were inserted.
        - The head of the queue holds the least recently used items; reading
          or writing a key moves it to the tail.
        - A capacity of zero or less disables caching: writes are ignored.
    """

    def __init__(self, capacity):
        """Create an empty cache holding at most `capacity` items."""
        self.capacity = capacity
        self.cache = collections.OrderedDict()

    def __contains__(self, key):
        """Return True if `key` is cached (does not refresh its position)."""
        return key in self.cache

    def __getitem__(self, key):
        """Return the cached value and mark `key` as most recently used.

        Raises:
            KeyError: if `key` is not in the cache.
        """
        # Pop and re-insert so the key moves to the back of the queue.
        value = self.cache.pop(key)
        self.cache[key] = value
        return value

    def __setitem__(self, key, value):
        """Store `value` under `key`, evicting the oldest item when full."""
        if self.capacity <= 0:
            # Nothing can be kept; evicting from an empty OrderedDict would
            # raise KeyError.
            return

        # Drop any existing entry so the key is re-inserted at the back.
        self.cache.pop(key, None)

        if len(self.cache) >= self.capacity:
            # Pop the oldest item at the beginning of the queue.
            self.cache.popitem(last=False)

        self.cache[key] = value
|
||||
|
||||
|
||||
class memoize(object):
    """A basic memoizer that caches the results of a remote function.

    Notes:
        - How is this implemented?
            This memoizer adds a caching layer in front of the wrapped
            function's `remote` attribute. When the user calls
            f.remote(*args), we first check the cache, and only call the
            underlying remote function if no cached return value is found.
        - When should this be used?
            This should be used when we anticipate temporal locality for the
            function. For example, we can reasonably assume users will
            perform columnar operations repetitively over time (like sum()
            or loc[]).
        - Caveat
            The positional arguments are the cache key. Do not use this
            decorator if an argument can be mutated between calls while
            still hashing and comparing equal: the second call is then a
            cache hit and returns the result computed for the old contents.
            Unhashable arguments (such as a list or dict) cannot be used as
            a key, so those calls bypass the cache and always invoke the
            remote function.
            In short, use this decorator sparingly. The ideal case is that
            all inputs are ray ObjectIDs because they are immutable objects.
        - Future Development
            - Fix the mutability bug
            - Dynamic cache size (Fixed as 1000 for now)
    """

    def __init__(self, f):
        """Wrap `f`, keeping a reference to its original `remote` callable."""
        # Save the original remote function.
        self.old_remote_func = f.remote
        self.cache = LRUCache(capacity=_MEMOIZER_CAPACITY)

    def remote(self, *args):
        """Return the cached result if these arguments were seen before.

        Otherwise call the wrapped remote function, cache its result under
        `args`, and return it.
        """
        try:
            hash(args)
        except TypeError:
            # Unhashable arguments cannot be cache keys; skip the cache
            # instead of failing the call.
            return self.old_remote_func(*args)

        if args in self.cache:
            return self.cache[args]

        result = self.old_remote_func(*args)
        self.cache[args] = result
        return result
|
||||
|
||||
|
||||
def _get_nan_block_id(n_row=1, n_col=1, transpose=False):
    """A memory efficient way to get a block of NaNs.

    Blocks are cached per shape in _NAN_BLOCKS, so each distinct shape is
    put into the object store at most once.

    Args:
        n_row(int): number of rows
        n_col(int): number of columns
        transpose(bool): if true, swap rows and columns
    Returns:
        ObjectID of the NaN block
    """
    shape = (n_col, n_row) if transpose else (n_row, n_col)
    if shape not in _NAN_BLOCKS:
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        nan_block = pandas.DataFrame(data=np.full(shape, np.nan))
        _NAN_BLOCKS[shape] = ray.put(nan_block)
    return _NAN_BLOCKS[shape]
|
||||
|
||||
|
||||
def _get_lengths(df):
|
||||
"""Gets the length of the dataframe.
|
||||
Args:
|
||||
df: A remote pandas.DataFrame object.
|
||||
Returns:
|
||||
Returns an integer length of the dataframe object. If the attempt
|
||||
fails, returns 0 as the length.
|
||||
"""
|
||||
try:
|
||||
return len(df)
|
||||
# Because we sometimes have cases where we have summary statistics in our
|
||||
# DataFrames
|
||||
except TypeError:
|
||||
return 0
|
||||
|
||||
|
||||
def _get_widths(df):
|
||||
"""Gets the width (number of columns) of the dataframe.
|
||||
Args:
|
||||
df: A remote pandas.DataFrame object.
|
||||
Returns:
|
||||
Returns an integer width of the dataframe object. If the attempt
|
||||
fails, returns 0 as the length.
|
||||
"""
|
||||
try:
|
||||
return len(df.columns)
|
||||
# Because we sometimes have cases where we have summary statistics in our
|
||||
# DataFrames
|
||||
except TypeError:
|
||||
return 0
|
||||
|
||||
|
||||
def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
    """Partitions a Pandas DataFrame object.

    Each partition is put into the object store with a fresh RangeIndex on
    both axes. The input DataFrame is not modified. An empty DataFrame yields
    a single empty partition.

    Args:
        df (pandas.DataFrame): The pandas DataFrame to convert.
        num_partitions (int): The number of partitions to split the DataFrame
            into. Has priority over row_chunksize.
        row_chunksize (int): The number of rows to put in each partition.
    Returns:
        [ObjectID]: A list of object IDs corresponding to the dataframe
            partitions
    Raises:
        ValueError: if the DataFrame has rows but the chunk size is not
            positive.
    """
    n_rows = len(df)
    if num_partitions is not None:
        # Ceiling division, so no trailing rows are lost.
        row_chunksize = -(-n_rows // num_partitions)
    else:
        assert row_chunksize is not None

    if n_rows > 0 and row_chunksize < 1:
        # A non-positive chunk size would never consume any rows.
        raise ValueError("row_chunksize must be a positive integer, got "
                         "{}".format(row_chunksize))

    n_cols = len(df.columns)
    row_partitions = []
    # max(..., 1) keeps one (empty) partition for an empty DataFrame.
    for start in range(0, max(n_rows, 1), max(row_chunksize, 1)):
        # iloc slices by position whatever the index type is, and
        # reset_index returns a new frame, so `df` itself is left untouched.
        # A pandas.RangeIndex within the partitions is smaller and sometimes
        # faster.
        chunk = df.iloc[start:start + row_chunksize].reset_index(drop=True)
        chunk.columns = pandas.RangeIndex(0, n_cols)
        row_partitions.append(ray.put(chunk))

    return row_partitions
|
||||
|
||||
|
||||
def from_pandas(df, num_partitions=None, chunksize=None):
    """Build a Ray DataFrame from a pandas DataFrame.

    Args:
        df (pandas.DataFrame): The pandas DataFrame to convert.
        num_partitions (int): The number of partitions to split the DataFrame
            into. Has priority over chunksize.
        chunksize (int): The number of rows to put in each partition.
    Returns:
        A new Ray DataFrame object.
    """
    # Imported inside the function rather than at module level.
    from .dataframe import DataFrame

    partitions = _partition_pandas_dataframe(df, num_partitions, chunksize)

    return DataFrame(row_partitions=partitions,
                     columns=df.columns,
                     index=df.index)
|
||||
|
||||
|
||||
def to_pandas(df):
    """Gather a Ray DataFrame into a single pandas object.

    Args:
        df (ray.DataFrame): The Ray DataFrame to convert.
    Returns:
        The concatenated row partitions, carrying df's index and columns.
    """
    partitions = ray.get(df._row_partitions)
    result = pandas.concat(partitions, copy=False)
    # Restore the labels kept on the Ray DataFrame.
    result.index = df.index
    result.columns = df.columns
    return result
|
||||
|
||||
|
||||
@ray.remote
def extractor(df_chunk, row_loc, col_loc):
    """Retrieve df_chunk.iloc[row_loc, col_loc] from a remote block."""
    # The locators are made writeable to work around a pandas bug:
    # https://github.com/pandas-dev/pandas/issues/17192
    try:
        for locator in (row_loc, col_loc):
            locator.flags.writeable = True
    except AttributeError:
        # Locators might be scalars or python lists, which have no flags.
        pass
    return df_chunk.iloc[row_loc, col_loc]
|
||||
|
||||
|
||||
@ray.remote
def writer(df_chunk, row_loc, col_loc, item):
    """Return a copy of the block with `item` written at the given location.

    The block passed in is not modified.
    """
    updated = df_chunk.copy()
    updated.iloc[row_loc, col_loc] = item
    return updated
|
||||
|
||||
|
||||
def _mask_block_partitions(blk_partitions, row_metadata, col_metadata):
    """Return the squeezed/expanded block partitions as defined by
    row_metadata and col_metadata.

    Note:
        Very naive implementation: one remote extraction is issued per
        (row, column) coordinate pair, so every result block holds a single
        cell.
    """
    col_coords = col_metadata._coord_df
    row_coords = row_metadata._coord_df
    out_shape = (len(row_coords.index), len(col_coords.index))

    # Row-major order, matching the reshape below.
    oids = [
        extractor.remote(
            blk_partitions[row_entry.partition, col_entry.partition],
            [row_entry['index_within_partition']],
            [col_entry['index_within_partition']])
        for _, row_entry in row_coords.iterrows()
        for _, col_entry in col_coords.iterrows()
    ]
    return np.array(oids).reshape(out_shape)
|
||||
|
||||
|
||||
@ray.remote
def _deploy_func(func, dataframe, *args):
    """Deploys a function for the _map_partitions call.

    Args:
        func (callable): The function to run.
        dataframe (pandas.DataFrame): The pandas DataFrame for this partition.
        *args: Extra positional arguments passed to func after the dataframe.
    Returns:
        The return value of func(dataframe, *args).
    """
    # With no extra arguments this is just func(dataframe).
    return func(dataframe, *args)
|
||||
|
||||
|
||||
def _map_partitions(func, partitions, *argslists):
    """Apply a function to every partition.

    Args:
        func (callable): The function to apply
        partitions ([ObjectID]): The list of partitions to map func on.
        *argslists: Extra arguments. A single extra argument is passed
            unchanged to every call; two or more are zipped with the
            partitions, so each must have one entry per partition.

    Returns:
        A list of partitions ([ObjectID]) with the result of the function,
        or None when partitions is None.
    """
    if partitions is None:
        return None

    assert callable(func)
    n_extra = len(argslists)

    if n_extra == 0:
        return [_deploy_func.remote(func, part) for part in partitions]

    if n_extra == 1:
        shared_arg = argslists[0]
        return [_deploy_func.remote(func, part, shared_arg)
                for part in partitions]

    assert all(len(args) == len(partitions) for args in argslists)
    return [_deploy_func.remote(func, *call_args)
            for call_args in zip(partitions, *argslists)]
|
||||
|
||||
|
||||
@ray.remote
def _build_col_widths(df_col):
    """Compute widths (# of columns) for each partition."""
    # Submit every measurement first, then fetch them all in one ray.get.
    pending = [_deploy_func.remote(_get_widths, part) for part in df_col]
    return np.array(ray.get(pending))
|
||||
|
||||
|
||||
@ray.remote
def _build_row_lengths(df_row):
    """Compute lengths (# of rows) for each partition."""
    # Submit every measurement first, then fetch them all in one ray.get.
    pending = [_deploy_func.remote(_get_lengths, part) for part in df_row]
    return np.array(ray.get(pending))
|
||||
|
||||
|
||||
@ray.remote
def _build_coord_df(lengths, index):
    """Build the coordinate dataframe over all partitions.

    Each row of the result maps one label of `index` to a
    (partition, index_within_partition) pair. Zero-length partitions are
    dropped before numbering, so partition numbers count only the non-empty
    ones.
    """
    nonempty = [n for n in lengths if n > 0]
    if len(nonempty) > 0:
        per_partition = [
            np.column_stack((np.full(n, part), np.arange(n)))
            for part, n in enumerate(nonempty)
        ]
        coords = np.vstack(per_partition)
    else:
        coords = None
    return pandas.DataFrame(coords, index=index,
                            columns=("partition", "index_within_partition"))
|
||||
|
||||
|
||||
def _create_block_partitions(partitions, axis=0, length=None):
    """Split each partition into blocks and return them as a 2D array.

    Args:
        partitions: The partitions to split.
        axis: Passed to create_blocks; for any value other than 0 the block
            array is transposed.
        length: Optional cap on the number of blocks per partition; 0 gives
            a single block.

    Returns:
        A 2D numpy array of the values returned by create_blocks._submit.
    """
    if length == 0:
        npartitions = 1
    else:
        default_npartitions = get_npartitions()
        if length is not None and default_npartitions > length:
            npartitions = length
        else:
            npartitions = default_npartitions

    submitted = [create_blocks._submit(args=(partition, npartitions, axis),
                                       num_return_vals=npartitions)
                 for partition in partitions]

    # In the case that axis is 1 we have to transpose because we build the
    # columns into rows. Fortunately numpy is efficient at this.
    blocks = np.array(submitted)
    if axis != 0:
        blocks = blocks.T

    # Sometimes we only get a single column or row, which is problematic for
    # building blocks from the partitions, so we add whatever dimension
    # we're missing from the input.
    return fix_blocks_dimensions(blocks, axis)
|
||||
|
||||
|
||||
@ray.remote
def create_blocks(df, npartitions, axis):
    """Remote wrapper around create_blocks_helper."""
    return create_blocks_helper(df, npartitions=npartitions, axis=axis)
|
||||
|
||||
|
||||
def create_blocks_helper(df, npartitions, axis):
    """Split a DataFrame into `npartitions` blocks.

    With axis == 0 the columns are split; otherwise the rows are split. Every
    block gets a fresh RangeIndex on both axes.

    Returns:
        `df` itself when npartitions is 1, otherwise a list of blocks.
    """
    # Single partition dataframes don't need to be repartitioned
    if npartitions == 1:
        return df

    # In the case that the size is not a multiple of the number of
    # partitions, round the block size up to avoid losing data off the end.
    quotient, remainder = divmod(df.shape[axis ^ 1], npartitions)
    block_size = quotient if remainder == 0 else quotient + 1

    blocks = []
    for i in range(npartitions):
        lo, hi = i * block_size, (i + 1) * block_size
        if axis == 0:
            block = df.iloc[:, lo:hi]
        else:
            block = df.iloc[lo:hi, :]
        block.columns = pandas.RangeIndex(0, len(block.columns))
        block.reset_index(inplace=True, drop=True)
        blocks.append(block)
    return blocks
|
||||
|
||||
|
||||
@memoize
@ray.remote
def _blocks_to_col(*partition):
    """Concatenate blocks along axis 0, renumbering the index.

    Returns an empty pandas.Series when no blocks are given.
    """
    if not partition:
        return pandas.Series()
    stacked = pandas.concat(partition, axis=0, copy=False)
    return stacked.reset_index(drop=True)
|
||||
|
||||
|
||||
@memoize
@ray.remote
def _blocks_to_row(*partition):
    """Concatenate blocks along axis 1, renumbering both axes."""
    joined = pandas.concat(partition, axis=1, copy=False)
    row_part = joined.reset_index(drop=True)
    # Because our block partitions contain different indices (for the
    # columns), this change is needed to ensure correctness.
    row_part.columns = pandas.RangeIndex(0, len(row_part.columns))
    return row_part
|
||||
|
||||
|
||||
def _inherit_docstrings(parent, excluded=[]):
|
||||
"""Creates a decorator which overwrites a decorated class' __doc__
|
||||
attribute with parent's __doc__ attribute. Also overwrites __doc__ of
|
||||
methods and properties defined in the class with the __doc__ of matching
|
||||
methods and properties in parent.
|
||||
|
||||
Args:
|
||||
parent (object): Class from which the decorated class inherits __doc__.
|
||||
excluded (list): List of parent objects from which the class does not
|
||||
inherit docstrings.
|
||||
|
||||
Returns:
|
||||
function: decorator which replaces the decorated class' documentation
|
||||
parent's documentation.
|
||||
"""
|
||||
def decorator(cls):
|
||||
if parent not in excluded:
|
||||
cls.__doc__ = parent.__doc__
|
||||
for attr, obj in cls.__dict__.items():
|
||||
parent_obj = getattr(parent, attr, None)
|
||||
if parent_obj in excluded or \
|
||||
(not callable(parent_obj) and
|
||||
not isinstance(parent_obj, property)):
|
||||
continue
|
||||
if callable(obj):
|
||||
obj.__doc__ = parent_obj.__doc__
|
||||
elif isinstance(obj, property) and obj.fget is not None:
|
||||
p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__)
|
||||
setattr(cls, attr, p)
|
||||
|
||||
return cls
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
@ray.remote
def _reindex_helper(old_index, new_index, axis, npartitions, *df):
    """Reindexes a dataframe to prepare for join/concat.

    Args:
        old_index: The index/column for this partition.
        new_index: The new index/column to assign.
        axis: Which axis to reindex over.
        npartitions: The number of blocks to split the result into.
        df: The blocks making up the DataFrame partition.

    Returns:
        A new set of blocks made up of DataFrames.
    """
    other_axis = axis ^ 1
    combined = pandas.concat(df, axis=other_axis)

    # Put the real labels back on the concatenated axis before reindexing.
    if axis == 1:
        combined.index = old_index
    elif axis == 0:
        combined.columns = old_index

    reindexed = combined.reindex(new_index, copy=False, axis=other_axis)
    return create_blocks_helper(reindexed, npartitions, axis)
|
||||
|
||||
|
||||
@ray.remote
def _co_op_helper(func, left_columns, right_columns, left_df_len, left_idx,
                  *zipped):
    """Copartition operation where two DataFrames must have aligned indexes.

    NOTE: This function assumes things are already copartitioned. Requires that
        row partitions are passed in as blocks.

    Args:
        func: The operation to conduct between two DataFrames.
        left_columns: The column names for the left DataFrame.
        right_columns: The column names for the right DataFrame.
        left_df_len: The number of blocks belonging to the left DataFrame.
            This is used so we can split up the zipped partitions.
        left_idx: Optional index to assign to the left DataFrame.
        zipped: The DataFrame partitions (in blocks), left blocks first.

    Returns:
        A new set of blocks for the partitioned DataFrame. When left_idx is
        given, the index of the result is appended as the last element.
    """
    left_blocks = zipped[:left_df_len]
    right_blocks = zipped[left_df_len:]

    left = pandas.concat(left_blocks, axis=1, copy=False).copy()
    left.columns = left_columns
    use_left_index = left_idx is not None
    if use_left_index:
        left.index = left_idx

    right = pandas.concat(right_blocks, axis=1, copy=False).copy()
    right.columns = right_columns

    combined = func(left, right)
    result_blocks = create_blocks_helper(combined, left_df_len, 0)

    if use_left_index:
        result_blocks.append(combined.index)

    return result_blocks
|
||||
|
||||
|
||||
@ray.remote
def _match_partitioning(column_partition, lengths, index):
    """Match the number of rows on each partition. Used in df.merge().

    NOTE: This function can cause problems when there are empty column
        partitions.

    The way this function is intended to be used is as follows: Align the
    right partitioning with the left. The left will remain unchanged. Then,
    you are free to perform actions on a per-partition basis with the
    partitioning.

    The index objects must already be identical for this to work correctly.

    Args:
        column_partition: The column partition to change.
        lengths: The lengths of each row partition to match to.
        index: The index of the column_partition. This is used to push
            down to the inner frame for correctness in the merge.

    Returns:
        A list of blocks created from this column partition.
    """
    columns = column_partition.columns
    # We set this because this is the only place we can guarantee correct
    # placement. We use it in the case the user wants to join on the index.
    column_partition.index = index

    remaining = column_partition
    pieces = []
    for length in lengths:
        if len(remaining) == 0:
            # Out of rows: emit an empty frame with the same columns.
            pieces.append(pandas.DataFrame(columns=columns))
        else:
            pieces.append(remaining.iloc[:length, :])
            remaining = remaining.iloc[length:, :]
    return pieces
|
||||
|
||||
|
||||
@ray.remote
def _concat_index(*index_parts):
    """Append all remaining index parts to the first one."""
    first, rest = index_parts[0], index_parts[1:]
    return first.append(rest)
|
||||
|
||||
|
||||
def fix_blocks_dimensions(blocks, axis):
    """Return `blocks` as a 2D array, adding a dimension if it has fewer.

    The new dimension is inserted at position axis ^ 1. Arrays that already
    have two or more dimensions are returned unchanged.
    """
    if blocks.ndim >= 2:
        return blocks
    return np.expand_dims(blocks, axis=axis ^ 1)
|
||||
|
||||
|
||||
@ray.remote
def _compile_remote_dtypes(*column_of_blocks):
    """Return the dtypes of the concatenated first rows of the blocks."""
    # Only the row labelled 0 of each block is used, not the full data.
    first_rows = [block.loc[0:0] for block in column_of_blocks]
    combined = pandas.concat(first_rows, copy=False)
    return combined.dtypes
|
||||
Reference in New Issue
Block a user