Dataframe deprecation (#2353)

This commit is contained in:
Devin Petersohn
2018-07-06 00:16:22 -07:00
committed by Robert Nishihara
parent 4f1d14e176
commit 4185aaed10
22 changed files with 6 additions and 16270 deletions
+2 -60
View File
@@ -1,60 +1,2 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas
# TODO: In the future `set_option` or similar needs to run on every node
# in order to keep all pandas instances across nodes consistent
from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
test, qcut, match, Panel, date_range, Index, MultiIndex,
CategoricalIndex, Series, bdate_range, DatetimeIndex,
Timedelta, Timestamp, to_timedelta, set_eng_float_format,
set_option, NaT, PeriodIndex, Categorical)
import threading
# Pull the major/minor components out of the installed pandas release
# string; both are kept as module attributes.
pandas_version = pandas.__version__
pandas_major = int(pandas_version.split(".")[0])
pandas_minor = int(pandas_version.split(".")[1])

# Only the 0.x series is gated here: within it, 0.22 is the sole accepted
# minor release.
if not (pandas_major != 0 or pandas_minor == 22):
    raise Exception("In order to use Pandas on Ray, your pandas version must "
                    "be 0.22. You can run 'pip install pandas==0.22'")
DEFAULT_NPARTITIONS = 8
def set_npartition_default(n):
    """Rebind the module-level ``DEFAULT_NPARTITIONS`` to ``n``.

    Args:
        n: The new default number of partitions. Stored as given; no
            validation is performed.
    """
    global DEFAULT_NPARTITIONS
    DEFAULT_NPARTITIONS = n
def get_npartitions():
    """Return the current value of the module-level ``DEFAULT_NPARTITIONS``."""
    return DEFAULT_NPARTITIONS
# We import these files after the two functions above
# because they depend on npartitions.
from .dataframe import DataFrame # noqa: 402
from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402
read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402
read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402
read_sql) # noqa: 402
from .concat import concat # noqa: 402
from .datetimes import to_datetime # noqa: 402
from .reshape import get_dummies # noqa: 402
# Names exported by ``from <package> import *``. Each name is listed once
# (the original list repeated "Series").
__all__ = [
    "DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval",
    "unique", "value_counts", "cut", "to_numeric", "factorize", "test", "qcut",
    "match", "to_datetime", "get_dummies", "Panel", "date_range", "Index",
    "MultiIndex", "bdate_range", "DatetimeIndex", "to_timedelta",
    "set_eng_float_format", "set_option", "CategoricalIndex", "Timedelta",
    "Timestamp", "NaT", "PeriodIndex", "Categorical"
]
# Start Ray automatically, but only when this package is imported from the
# thread named "MainThread".
if threading.current_thread().name == "MainThread":
    try:
        import ray
        ray.init()
    except AssertionError:
        # Best effort: an AssertionError from the import/init is ignored.
        pass
# Importing this package always fails with a pointer to the successor project.
raise DeprecationWarning(
    "Pandas on Ray has moved to Modin: github.com/modin-project/modin")
-133
View File
@@ -1,133 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas
import numpy as np
from .dataframe import DataFrame
from .utils import _reindex_helper
def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
           keys=None, levels=None, names=None, verify_integrity=False,
           copy=True):
    """Concatenate pandas and Ray objects along an axis into a Ray DataFrame.

    Args:
        objs: Iterable of pandas.Series, pandas.DataFrame and/or Ray
            DataFrame objects. ``None`` entries are dropped. When ``keys``
            is given, ``objs`` is indexed with each key.
        axis: Axis to concatenate along (anything accepted by
            ``pandas.DataFrame._get_axis_number``).
        join: 'inner' or 'outer'; how to combine the other axis.
        join_axes, ignore_index, keys, levels, names, verify_integrity:
            Forwarded to ``pandas.concat`` when computing the resulting
            index and columns.
        copy: Forwarded to ``pandas.concat`` only when every object is a
            pandas.Series.

    Returns:
        A Ray DataFrame holding the concatenated data.

    Raises:
        ValueError: If there is nothing to concatenate, if an object has an
            unsupported type, or if ``join`` is not 'inner'/'outer'.
        NotImplementedError: If ``objs`` is a dict and no ``keys`` are given.
    """
    objs_is_dict = isinstance(objs, dict)

    if keys is not None:
        objs = [objs[k] for k in keys]
    else:
        objs = list(objs)

    if len(objs) == 0:
        raise ValueError("No objects to concatenate")

    # list() on a dict keeps only its keys, so this has to be reported before
    # the type check below turns it into a misleading "type str" error. (The
    # original placed this check after the conversion, where it could never
    # fire.)
    if objs_is_dict and keys is None:
        raise NotImplementedError(
            "Obj as dicts not implemented. To contribute to "
            "Pandas on Ray, please visit github.com/ray-project/ray.")

    objs = [obj for obj in objs if obj is not None]

    if len(objs) == 0:
        raise ValueError("All objects passed were None")

    for obj in objs:
        if not isinstance(obj, (pandas.Series, pandas.DataFrame, DataFrame)):
            # The original passed the type as a second ValueError argument
            # instead of formatting it into the message.
            raise ValueError("cannot concatenate object of type \"{0}\"; only "
                             "pandas.Series, pandas.DataFrame, "
                             "and ray.dataframe.DataFrame objs are "
                             "valid".format(type(obj)))

    all_series = all(isinstance(obj, pandas.Series)
                     for obj in objs)
    if all_series:
        return DataFrame(pandas.concat(objs, axis, join, join_axes,
                                       ignore_index, keys, levels, names,
                                       verify_integrity, copy))

    axis = pandas.DataFrame()._get_axis_number(axis)

    if join not in ['inner', 'outer']:
        raise ValueError("Only can inner (intersect) or outer (union) join the"
                         " other axis")

    # Turn every Series into a one-column pandas DataFrame *before* reading
    # .index/.columns, because a Series has no .columns attribute.
    #
    # Pandas puts all of the Series in a single column named 0 when
    # concatenating along axis 0, regardless of the existence of another
    # column named 0. Along axis 1 a Series keeps its name; nameless Series
    # are numbered 0, 1, 2, ... in order of appearance.
    unnamed_seen = 0
    converted = []
    for obj in objs:
        if isinstance(obj, pandas.Series):
            if axis == 0:
                label = 0
            elif obj.name is not None:
                label = obj.name
            else:
                label = unnamed_seen
                unnamed_seen += 1
            obj = pandas.DataFrame(obj)
            # Column labels must be list-like; a bare scalar is rejected.
            obj.columns = [label]
        converted.append(obj)
    objs = converted

    # We need these in lists because we use them later.
    all_index = [obj.index for obj in objs]
    all_columns = [obj.columns for obj in objs]

    # Using concat on the columns and index is fast because they're empty,
    # and it forces the error checking. It also puts the columns in the
    # correct order for us.
    final_index = \
        pandas.concat([pandas.DataFrame(index=idx) for idx in all_index],
                      axis=axis, join=join, join_axes=join_axes,
                      ignore_index=ignore_index, keys=keys, levels=levels,
                      names=names, verify_integrity=verify_integrity,
                      copy=False).index
    final_columns = \
        pandas.concat([pandas.DataFrame(columns=col)
                       for col in all_columns],
                      axis=axis, join=join, join_axes=join_axes,
                      ignore_index=ignore_index, keys=keys, levels=levels,
                      names=names, verify_integrity=verify_integrity,
                      copy=False).columns

    # Put all of the DataFrames into Ray format
    # TODO just partition the DataFrames instead of building a new Ray DF.
    objs = [DataFrame(obj) if isinstance(obj, (pandas.DataFrame,
                                               pandas.Series)) else obj
            for obj in objs]

    # Here we reuse all_columns/index so we don't have to materialize objects
    # from remote memory built in the previous line. In the future, we won't be
    # building new DataFrames, rather just partitioning the DataFrames.
    if axis == 0:
        nparts = len(objs[0]._block_partitions)
        new_blocks = np.array([_reindex_helper._submit(
            args=tuple([columns, final_columns, axis, nparts]
                       + part.tolist()),
            num_return_vals=nparts)
            for columns, obj in zip(all_columns, objs)
            for part in obj._block_partitions])
    else:
        # Transposing the columns is necessary because the remote task treats
        # everything like rows and returns in row-major format. Luckily, this
        # operation is cheap in numpy.
        nparts = len(objs[0]._block_partitions.T)
        new_blocks = np.array([_reindex_helper._submit(
            args=tuple([index, final_index, axis, nparts]
                       + part.tolist()),
            num_return_vals=nparts)
            for index, obj in zip(all_index, objs)
            for part in obj._block_partitions.T]).T

    return DataFrame(block_partitions=new_blocks,
                     columns=final_columns,
                     index=final_index)
File diff suppressed because it is too large Load Diff
-64
View File
@@ -1,64 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas
import ray
from .dataframe import DataFrame
from .utils import _map_partitions
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None,
                box=True, format=None, exact=True, unit=None,
                infer_datetime_format=False, origin='unix'):
    """Convert the arg to datetime format. If not Ray DataFrame, this falls
        back on pandas.

    Args:
        errors ('raise' or 'ignore'): If 'ignore', errors are silenced.
        dayfirst (bool): Date format is passed in as day first.
        yearfirst (bool): Date format is passed in as year first.
        utc (bool): returns a UTC DatetimeIndex if True.
        box (bool): If True, returns a DatetimeIndex.
        format (string): strftime to parse time, eg "%d/%m/%Y".
        exact (bool): If True, require an exact format match.
        unit (string, default 'ns'): unit of the arg.
        infer_datetime_format (bool): Whether or not to infer the format.
        origin (string): Define the reference date.

    Returns:
        Type depends on input:

        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp
    """
    # Every pandas.to_datetime call below receives the same options.
    options = dict(errors=errors, dayfirst=dayfirst, yearfirst=yearfirst,
                   utc=utc, box=box, format=format, exact=exact, unit=unit,
                   infer_datetime_format=infer_datetime_format,
                   origin=origin)

    if not isinstance(arg, DataFrame):
        return pandas.to_datetime(arg, **options)

    if errors == 'raise':
        # Run pandas on an empty frame with the same columns so that its
        # validation errors surface here, before any remote work is started.
        pandas.to_datetime(pandas.DataFrame(columns=arg.columns), **options)

    def datetime_helper(df, cols):
        df.columns = cols
        return pandas.to_datetime(df, **options)

    datetime_series = _map_partitions(datetime_helper, arg._row_partitions,
                                      arg.columns)
    result = pandas.concat(ray.get(datetime_series), copy=False)
    result.index = arg.index

    return result
-581
View File
@@ -1,581 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas
import numpy as np
import pandas.core.groupby
from pandas.core.dtypes.common import is_list_like
import pandas.core.common as com
import ray
from .utils import _inherit_docstrings, _reindex_helper
from .concat import concat
from .index_metadata import _IndexMetadata
@_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy,
                     excluded=[pandas.core.groupby.DataFrameGroupBy,
                               pandas.core.groupby.DataFrameGroupBy.__init__])
class DataFrameGroupBy(object):
    """Group-by object over a Ray DataFrame.

    ``__init__`` groups the index (axis 0) or column labels (axis 1) with
    pandas and submits one remote ``groupby`` task per block slice. The
    public methods then apply a function to every group and stitch the
    per-group results back together with ``concat``.
    """

    def __init__(self, df, by, axis, level, as_index, sort, group_keys,
                 squeeze, **kwargs):
        self._columns = df.columns
        self._index = df.index
        self._axis = axis
        self._row_metadata = df._row_metadata
        self._col_metadata = df._col_metadata

        if axis == 0:
            partitions = [column for column in df._block_partitions.T]
            self._index_grouped = \
                pandas.Series(self._index, index=self._index) \
                .groupby(by=by, sort=sort)
        else:
            partitions = [row for row in df._block_partitions]
            self._index_grouped = \
                pandas.Series(self._columns, index=self._columns) \
                .groupby(by=by, sort=sort)

        self._keys_and_values = [(k, v)
                                 for k, v in self._index_grouped]

        if len(self) > 1:
            self._grouped_partitions = \
                list(zip(*(groupby._submit(args=(by,
                                                 axis,
                                                 level,
                                                 as_index,
                                                 sort,
                                                 group_keys,
                                                 squeeze)
                                           + tuple(part.tolist()),
                                           num_return_vals=len(self))
                           for part in partitions)))
        else:
            if axis == 0:
                self._grouped_partitions = [df._col_partitions]
            else:
                self._grouped_partitions = [df._row_partitions]

    def __getattr__(self, key):
        """After regular attribute access, looks up the name in the columns

        Args:
            key (str): Attribute name.

        Returns:
            The value of the attribute.
        """
        try:
            return object.__getattribute__(self, key)
        except AttributeError as e:
            # Read ``_columns`` straight from the instance dict: going through
            # ``self._columns`` would re-enter __getattr__ forever whenever
            # the attribute has not been set yet.
            columns = self.__dict__.get("_columns", ())
            if key in columns:
                raise NotImplementedError(
                    "SeriesGroupBy is not implemented. "
                    "To contribute to Pandas on Ray, please visit "
                    "github.com/ray-project/ray.")
            raise e

    @property
    def _iter(self):
        # Builds a fresh list of (group key, Ray DataFrame) pairs on every
        # access.
        from .dataframe import DataFrame

        if self._axis == 0:
            return [(self._keys_and_values[i][0],
                     DataFrame(col_partitions=part,
                               columns=self._columns,
                               index=self._keys_and_values[i][1].index,
                               col_metadata=self._col_metadata))
                    for i, part in enumerate(self._grouped_partitions)]
        else:
            return [(self._keys_and_values[i][0],
                     DataFrame(row_partitions=part,
                               columns=self._keys_and_values[i][1].index,
                               index=self._index,
                               row_metadata=self._row_metadata))
                    for i, part in enumerate(self._grouped_partitions)]

    @property
    def ngroups(self):
        return len(self)

    def skew(self, **kwargs):
        return self._apply_agg_function(lambda df: df.skew(axis=self._axis,
                                                           **kwargs))

    def ffill(self, limit=None):
        return self._apply_df_function(lambda df: df.ffill(axis=self._axis,
                                                           limit=limit))

    def sem(self, ddof=1):
        return self._apply_agg_function(lambda df: df.sem(axis=self._axis,
                                                          ddof=ddof))

    def mean(self, *args, **kwargs):
        return self._apply_agg_function(lambda df: df.mean(axis=self._axis,
                                                           *args,
                                                           **kwargs))

    def any(self):
        return self._apply_agg_function(lambda df: df.any(axis=self._axis))

    @property
    def plot(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def ohlc(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def __bytes__(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    @property
    def tshift(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    @property
    def groups(self):
        return {k: pandas.Index(v) for k, v in self._keys_and_values}

    def min(self, **kwargs):
        return self._apply_agg_function(lambda df: df.min(axis=self._axis,
                                                          **kwargs))

    def idxmax(self):
        return self._apply_idx_function("idxmax")

    @property
    def ndim(self):
        return 2  # ndim is always 2 for DataFrames

    def shift(self, periods=1, freq=None, axis=0):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def nth(self, n, dropna=None):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def cumsum(self, axis=0, *args, **kwargs):
        return self._apply_df_function(lambda df: df.cumsum(axis,
                                                            *args,
                                                            **kwargs))

    @property
    def indices(self):
        return dict(self._keys_and_values)

    def pct_change(self):
        return self._apply_agg_function(
            lambda df: df.pct_change(axis=self._axis))

    def filter(self, func, dropna=True, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def cummax(self, axis=0, **kwargs):
        return self._apply_df_function(lambda df: df.cummax(axis,
                                                            **kwargs))

    def apply(self, func, *args, **kwargs):
        groups = self._iter
        # Forward the extra arguments to ``func``; the original accepted
        # them but never passed them on.
        result = [func(v, *args, **kwargs) for k, v in groups]

        if isinstance(result[0], pandas.Series):
            # Applied an aggregation function
            return self._label_agg_result(concat(result, axis=1),
                                          [k for k, v in groups])

        return self._restore_original_order(concat(result, axis=self._axis))

    @property
    def dtypes(self):
        if self._axis == 1:
            raise ValueError("Cannot call dtypes on groupby with axis=1")
        return self._apply_agg_function(lambda df: df.dtypes)

    def first(self, **kwargs):
        return self._apply_agg_function(lambda df: df.first(offset=0,
                                                            **kwargs))

    def backfill(self, limit=None):
        return self.bfill(limit)

    def __getitem__(self, key):
        # This operation requires a SeriesGroupBy Object
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def cummin(self, axis=0, **kwargs):
        return self._apply_df_function(lambda df: df.cummin(axis=axis,
                                                            **kwargs))

    def bfill(self, limit=None):
        return self._apply_df_function(lambda df: df.bfill(axis=self._axis,
                                                           limit=limit))

    def idxmin(self):
        return self._apply_idx_function("idxmin")

    def prod(self, **kwargs):
        return self._apply_agg_function(lambda df: df.prod(axis=self._axis,
                                                           **kwargs))

    def std(self, ddof=1, *args, **kwargs):
        return self._apply_agg_function(lambda df: df.std(axis=self._axis,
                                                          ddof=ddof,
                                                          *args,
                                                          **kwargs))

    def aggregate(self, arg, *args, **kwargs):
        if self._axis != 0:
            # This is not implemented in pandas,
            # so we throw a different message
            raise NotImplementedError("axis other than 0 is not supported")

        if is_list_like(arg):
            raise NotImplementedError(
                "This requires Multi-level index to be implemented. "
                "To contribute to Pandas on Ray, please visit "
                "github.com/ray-project/ray.")
        return self._apply_agg_function(lambda df: df.agg(arg,
                                                          axis=self._axis,
                                                          *args,
                                                          **kwargs))

    def last(self, **kwargs):
        return self._apply_df_function(lambda df: df.last(offset=0,
                                                          **kwargs))

    def mad(self):
        # Reduce along the grouping axis, like the other aggregations.
        return self._apply_agg_function(lambda df: df.mad(axis=self._axis))

    def rank(self):
        return self._apply_df_function(lambda df: df.rank(axis=self._axis))

    @property
    def corrwith(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def pad(self, limit=None):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def max(self, **kwargs):
        return self._apply_agg_function(lambda df: df.max(axis=self._axis,
                                                          **kwargs))

    def var(self, ddof=1, *args, **kwargs):
        return self._apply_agg_function(lambda df: df.var(ddof=ddof,
                                                          axis=self._axis,
                                                          *args,
                                                          **kwargs))

    def get_group(self, name, obj=None):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def __len__(self):
        return len(self._keys_and_values)

    def all(self):
        # Reduce along the grouping axis, consistent with ``any``.
        return self._apply_agg_function(lambda df: df.all(axis=self._axis))

    def size(self):
        return self._apply_agg_function(lambda df: df.size)

    def sum(self, **kwargs):
        return self._apply_agg_function(lambda df:
                                        df.sum(axis=self._axis, **kwargs))

    def __unicode__(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def describe(self, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def boxplot(self, grouped, subplots=True, column=None, fontsize=None,
                rot=0, grid=True, ax=None, figsize=None, layout=None, **kwds):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def ngroup(self, ascending=True):
        return self._index_grouped.ngroup(ascending)

    def nunique(self, dropna=True):
        return self._apply_agg_function(lambda df: df.nunique(dropna=dropna,
                                                              axis=self._axis))

    def resample(self, rule, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def median(self, **kwargs):
        return self._apply_agg_function(lambda df: df.median(axis=self._axis,
                                                             **kwargs))

    def head(self, n=5):
        result = [v.head(n) for k, v in self._iter]
        new_df = concat(result, axis=self._axis)

        if self._axis == 0:
            # Series.head is positional, so this stays correct for float
            # indexes where ``v[:n]`` would slice by label.
            index_head = [v.head(n) for k, v in self._keys_and_values]
            flattened_index = {i for j in index_head for i in j}
            sorted_index = [i for i in self._index if i in flattened_index]
            self._reindex_rows(new_df, sorted_index)

        return new_df

    def cumprod(self, axis=0, *args, **kwargs):
        return self._apply_df_function(lambda df: df.cumprod(axis,
                                                             *args,
                                                             **kwargs))

    def __iter__(self):
        return self._iter.__iter__()

    def agg(self, arg, *args, **kwargs):
        return self.aggregate(arg, *args, **kwargs)

    def cov(self):
        return self._apply_agg_function(lambda df: df.cov())

    def transform(self, func, *args, **kwargs):
        return self._apply_df_function(lambda df: df.transform(func,
                                                               *args,
                                                               **kwargs))

    def corr(self, **kwargs):
        return self._apply_agg_function(lambda df: df.corr(**kwargs))

    def fillna(self, **kwargs):
        return self._apply_df_function(lambda df: df.fillna(axis=self._axis,
                                                            **kwargs))

    def count(self, **kwargs):
        return self._apply_agg_function(lambda df: df.count(self._axis,
                                                            **kwargs))

    def pipe(self, func, *args, **kwargs):
        return com._pipe(self, func, *args, **kwargs)

    def cumcount(self, ascending=True):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def tail(self, n=5):
        result = [v.tail(n) for k, v in self._iter]
        new_df = concat(result, axis=self._axis)

        if self._axis == 0:
            # Series.tail(0) is empty, whereas ``v[-0:]`` selects everything
            # and would disagree with the frames built above.
            index_tail = [v.tail(n) for k, v in self._keys_and_values]
            flattened_index = {i for j in index_tail for i in j}
            sorted_index = [i for i in self._index if i in flattened_index]
            self._reindex_rows(new_df, sorted_index)

        return new_df

    # expanding and rolling are unique cases and need to likely be handled
    # separately. They do not appear to be commonly used.
    def expanding(self, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def rolling(self, *args, **kwargs):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def hist(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def quantile(self, q=0.5, **kwargs):
        if is_list_like(q):
            raise NotImplementedError(
                "This requires Multi-level index to be implemented. "
                "To contribute to Pandas on Ray, please visit "
                "github.com/ray-project/ray.")

        return self._apply_agg_function(lambda df: df.quantile(q=q,
                                                               axis=self._axis,
                                                               **kwargs))

    def diff(self):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    def take(self, **kwargs):
        return self._apply_df_function(lambda df: df.take(**kwargs))

    def _apply_agg_function(self, f):
        # Apply ``f`` to every group and combine the per-group results into
        # one frame labelled by group key.
        assert callable(f), "\'{0}\' object is not callable".format(type(f))

        groups = self._iter
        result = [f(v) for k, v in groups]
        new_df = concat(result, axis=1)

        return self._label_agg_result(new_df, [k for k, v in groups])

    def _apply_df_function(self, f, concat_axis=None):
        # Apply ``f`` to every group, concatenate the results, and put the
        # rows/columns back in the order of the original frame.
        assert callable(f), "\'{0}\' object is not callable".format(type(f))

        result = [f(v) for k, v in self._iter]
        concat_axis = self._axis if concat_axis is None else concat_axis

        new_df = concat(result, axis=concat_axis)

        return self._restore_original_order(new_df)

    def _apply_idx_function(self, method_name):
        # Shared body of idxmax/idxmin: call the named method on every group
        # and look each returned value up in that group's index Series.
        def idx_helper(df, index):
            result = getattr(df, method_name)(axis=self._axis)
            result = result.apply(lambda v: index[v])
            return result

        groups = self._iter
        results = [idx_helper(g[1], i[1])
                   for g, i in zip(groups, self._index_grouped)]

        new_df = concat(results, axis=1)

        return self._label_agg_result(new_df, [k for k, v in groups])

    def _label_agg_result(self, new_df, keys):
        # Label a frame that holds one column per group: groups become rows
        # for axis 0 (after a transpose) and stay columns for axis 1.
        if self._axis == 0:
            new_df = new_df.T
            new_df.columns = self._columns
            new_df.index = keys
        else:
            new_df.columns = keys
            new_df.index = self._index
        return new_df

    def _restore_original_order(self, new_df):
        # Reindex along the grouped axis back to the original labels.
        if self._axis == 0:
            return self._reindex_rows(new_df, self._index)
        return self._reindex_columns(new_df, self._columns)

    def _reindex_rows(self, new_df, target_index):
        # Reindex the rows of ``new_df`` to ``target_index`` in place and
        # rebuild its row metadata.
        nparts = len(new_df._block_partitions)
        new_df._block_partitions = np.array([_reindex_helper._submit(
            args=tuple([new_df.index, target_index, 1, nparts]
                       + block.tolist()),
            num_return_vals=nparts)
            for block in new_df._block_partitions.T]).T
        new_df.index = target_index
        new_df._row_metadata = \
            _IndexMetadata(new_df._block_partitions[:, 0],
                           index=new_df.index, axis=0)
        return new_df

    def _reindex_columns(self, new_df, target_columns):
        # Reindex the columns of ``new_df`` to ``target_columns`` in place
        # and rebuild its column metadata.
        nparts = new_df._block_partitions.shape[1]
        new_df._block_partitions = np.array([_reindex_helper._submit(
            args=tuple([new_df.columns, target_columns, 0, nparts]
                       + block.tolist()),
            num_return_vals=nparts)
            for block in new_df._block_partitions])
        new_df.columns = target_columns
        new_df._col_metadata = \
            _IndexMetadata(new_df._block_partitions[0, :],
                           index=new_df.columns, axis=1)
        return new_df
@ray.remote
def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df):
    """Concatenate the given pandas blocks and split them into groups.

    Args:
        by, level, as_index, sort, group_keys, squeeze: Forwarded unchanged
            to ``pandas.DataFrame.groupby``.
        axis: Axis used both to concatenate the blocks and to group along.
        *df: The pandas blocks to concatenate before grouping.

    Returns:
        A list with one pandas object per group, in the order pandas
        iterates the groups (the group keys themselves are dropped).
    """
    combined = pandas.concat(df, axis=axis)
    grouped = combined.groupby(by=by,
                               axis=axis,
                               level=level,
                               as_index=as_index,
                               sort=sort,
                               group_keys=group_keys,
                               squeeze=squeeze)
    return [group for _, group in grouped]
-415
View File
@@ -1,415 +0,0 @@
import pandas
import numpy as np
import ray
from .utils import (
_build_row_lengths,
_build_col_widths,
_build_coord_df)
from pandas.core.indexing import convert_to_index_sliceable
class _IndexMetadata(object):
"""Wrapper for Pandas indexes in Ray DataFrames. Handles all of the
metadata specific to the axis of partition (setting indexes,
calculating the index within partition of a value, etc.). This
implementation assumes the underlying index lies across multiple
partitions.
IMPORTANT NOTE: Currently all operations, as implemented, are inplace.
WARNING: Currently, the `_lengths` item is the source of truth for an
_IndexMetadata object, since it is easy to manage, and that the coord_df
item may be deprecated in the future. As such, it is _very_ important that
any functions that mutate the coord_df splits in anyway first modify the
lengths. Otherwise bad things might happen!
"""
def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None,
             coord_df_oid=None):
    """Inits a IndexMetadata from Ray DataFrame partitions

    Args:
        dfs ([ObjectID]): ObjectIDs of dataframe partitions
        index (pandas.Index): Index of the Ray DataFrame.
        axis: Axis of partition (0=row partitions, 1=column partitions)
        lengths_oid: Optional precomputed lengths; must be passed together
            with ``coord_df_oid`` or not at all.
        coord_df_oid: Optional precomputed coordinate dataframe; must be
            passed together with ``lengths_oid`` or not at all.

    Returns:
        A IndexMetadata backed by the specified pandas.Index, partitioned
        off specified partitions
    """
    assert (lengths_oid is None) == (coord_df_oid is None), \
        "Must pass both or neither of lengths_oid and coord_df_oid"

    # Build the metadata remotely only when partitions were supplied and no
    # precomputed metadata came with them.
    if dfs is not None and lengths_oid is None:
        build_lengths = \
            _build_row_lengths if axis == 0 else _build_col_widths
        lengths_oid = build_lengths.remote(dfs)
        coord_df_oid = _build_coord_df.remote(lengths_oid, index)

    # Order matters: the ``_coord_df`` setter may write ``_index_cache``,
    # which is then overwritten with the explicit ``index`` argument.
    self._lengths = lengths_oid
    self._coord_df = coord_df_oid
    self._index_cache = index
    self._cached_index = False
def _get__lengths(self):
    """Return the partition lengths, fetching them from Ray on first use.

    The cached value is replaced by ``ray.get`` of itself when it is a
    ``ray.ObjectID`` or a non-empty list whose first element is one. An
    empty list is returned as-is (the original indexed element 0
    unconditionally and raised ``IndexError``).
    """
    cached = self._lengths_cache
    is_remote = isinstance(cached, ray.ObjectID) or (
        isinstance(cached, list) and
        len(cached) > 0 and
        isinstance(cached[0], ray.ObjectID))
    if is_remote:
        self._lengths_cache = ray.get(cached)
    return self._lengths_cache
def _set__lengths(self, lengths):
    """Store ``lengths`` as the cached lengths value, without fetching it."""
    self._lengths_cache = lengths
_lengths = property(_get__lengths, _set__lengths)
def _get__coord_df(self):
    """Get the coordinate dataframe wrapped by this _IndexMetadata.

    Since we may have had an index set before our coord_df was
    materialized, we'll have to apply it to the newly materialized df
    """
    # Materialize the dataframe the first time it is requested.
    if isinstance(self._coord_df_cache, ray.ObjectID):
        self._coord_df_cache = ray.get(self._coord_df_cache)

    # Apply an index that was set while the dataframe was still remote.
    if self._cached_index:
        self._coord_df_cache.index = self._index_cache
        self._cached_index = False

    return self._coord_df_cache
def _set__coord_df(self, coord_df):
    """Set the coordinate dataframe wrapped by this _IndexMetadata.

    Sometimes we set the _IndexMetadata's coord_df outside of the
    constructor, generally using fxns like drop(). This produces a modified
    index, so we need to reflect the change on the index cache.

    If the set _IndexMetadata is an OID instead (due to a copy or whatever
    reason), we fall back relying on `_index_cache`.
    """
    is_remote = isinstance(coord_df, ray.ObjectID)
    if not is_remote:
        # A materialized dataframe carries its own index; mirror it.
        self._index_cache = coord_df.index
    self._coord_df_cache = coord_df
_coord_df = property(_get__coord_df, _set__coord_df)
def _get_index(self):
    """Get the index wrapped by this _IndexMetadata.

    The only time `self._index_cache` would be None is in a newly created
    _IndexMetadata object without a specified `index` parameter (See the
    _IndexMetadata constructor for more details)
    """
    # While the coordinate dataframe is still remote, answer from the
    # cache instead of forcing it to be fetched.
    if isinstance(self._coord_df_cache, ray.ObjectID):
        return self._index_cache
    return self._coord_df_cache.index
def _set_index(self, new_index):
    """Set the index wrapped by this _IndexMetadata.

    It is important to always set `_index_cache` even if the coord_df is
    materialized due to the possibility that it is set to an OID later on.
    This design is more straightforward than caching indexes on setting the
    coord_df to an OID due to the possibility of an OID-to-OID change.
    """
    # Let pandas turn whatever was passed into an Index object.
    new_index = pandas.DataFrame(index=new_index).index
    assert len(new_index) == len(self)

    self._index_cache = new_index

    coord_df_is_remote = isinstance(self._coord_df_cache, ray.ObjectID)
    if not coord_df_is_remote:
        self._coord_df_cache.index = new_index
    else:
        # Defer: the getter applies the cached index once the dataframe
        # has been fetched.
        self._cached_index = True
index = property(_get_index, _set_index)
def _get_index_cache(self):
    """Get the cached Index object, which may sometimes be an OID.

    This will ray.get the Index object out of the Ray store lazily, such
    that it is not grabbed until it is needed in the driver. This layer of
    abstraction is important for allowing this object to be instantiated
    with a remote Index object.

    Returns:
        The Index object in _index_cache.
    """
    current = self._index_cache_validator
    if current is None:
        # No index was supplied: fall back to a RangeIndex of our length.
        self._index_cache_validator = pandas.RangeIndex(len(self))
    elif isinstance(current, ray.ObjectID):
        self._index_cache_validator = ray.get(current)

    return self._index_cache_validator
def _set_index_cache(self, new_index):
    """Sets the new index cache.

    Args:
        new_index: The Index to set the _index_cache to. Stored as given,
            without being fetched or validated.
    """
    self._index_cache_validator = new_index
# _index_cache_validator is an extra layer of abstraction to allow the
# cache to accept ObjectIDs and ray.get them when needed.
_index_cache = property(_get_index_cache, _set_index_cache)
def coords_of(self, key):
    """Returns the coordinates (partition, index_within_partition) of the
    provided key in the index. Can be called on its own or implicitly
    through __getitem__

    Args:
        key:
            item to get coordinates of. Can also be a tuple of item
            and {"partition", "index_within_partition"} if caller only
            needs one of the coordinates

    Returns:
        Pandas object with the keys specified. If key is a single object
        it will be a pandas.Series with items `partition` and
        `index_within_partition`, and if key is a slice or if the key is
        duplicate it will be a pandas.DataFrame with said items as columns.
    """
    coord_df = self._coord_df
    return coord_df.loc[key]
def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
            group_keys=True, squeeze=False, **kwargs):
    """Group the coordinate dataframe and return every group's rows.

    All arguments are forwarded to the coordinate dataframe's ``groupby``;
    the result of applying a full slice (``x[:]``) to each group is
    returned.
    """
    # TODO: Find out what this does, and write a fuller docstring
    grouped = self._coord_df.groupby(by=by, axis=axis, level=level,
                                     as_index=as_index, sort=sort,
                                     group_keys=group_keys,
                                     squeeze=squeeze, **kwargs)
    assignments_df = grouped.apply(lambda x: x[:])
    return assignments_df
def partition_series(self, partition):
    """Look up ``index_within_partition`` for the entries of one partition.

    Indexes ``self`` with a boolean mask selecting the coordinate rows
    whose ``partition`` equals the argument.
    """
    in_partition = self._coord_df['partition'] == partition
    return self[in_partition, 'index_within_partition']
def __len__(self):
    """Return the sum of all partition lengths as an ``int``."""
    total = sum(self._lengths)
    return int(total)
def reset_partition_coords(self, partitions=None):
    """Renumber ``index_within_partition`` from 0 for the given partitions.

    Args:
        partitions: Iterable of partition ids to reset.
            NOTE(review): the default of None becomes a 0-d numpy array,
            which cannot be iterated, so callers have to pass a sequence.
    """
    partitions = np.array(partitions)

    for partition in partitions:
        partition_mask = (self._coord_df['partition'] == partition)
        fresh_coords = np.arange(sum(partition_mask)).astype(int)

        # Since we are replacing columns with RangeIndex inside the
        # partition, we have to make sure that our reference to it is
        # updated as well.
        try:
            self._coord_df.loc[partition_mask,
                               'index_within_partition'] = fresh_coords
        except ValueError:
            # Copy the arrow sealed dataframe so we can mutate it.
            # We only do this the first time we try to mutate the sealed.
            self._coord_df = self._coord_df.copy()
            self._coord_df.loc[partition_mask,
                               'index_within_partition'] = fresh_coords
def insert(self, key, loc=None, partition=None,
           index_within_partition=None):
    """Inserts a key at a certain location in the index, or a certain coord
    in a partition. Called with either `loc` or `partition` and
    `index_within_partition`. If called with both, `loc` will be used.

    Args:
        key: item to insert into index
        loc: location to insert into index
        partition: partition to insert into
        index_within_partition: index within partition to insert into

    Returns:
        With the stop-gap `return` below, the tuple
        `(partition, index_within_partition)`. The code after it would
        return a DataFrame with the coordinates of the insert.

    Raises:
        IndexError: if `loc` is beyond the end of the index (see the
            review note on that branch below).

    NOTE(review): the indentation of this function was reconstructed. The
    stop-gap `return` is assumed to sit at function level, which makes
    everything after it unreachable -- confirm against the original file.
    """
    # Perform insert on a specific partition
    # Determine which partition to place it in, and where in that partition
    if loc is not None:
        cum_lens = np.cumsum(self._lengths)
        if len(cum_lens) > 1:
            # Bins are the cumulative lengths of all but the last partition
            # and are closed on the right, so a `loc` equal to a boundary
            # maps to the end of the earlier partition.
            partition = np.digitize(loc, cum_lens[:-1], right=True)
        else:
            partition = 0
        # NOTE(review): with len(cum_lens) - 1 bins, digitize presumably
        # never returns a value >= len(cum_lens), so this branch (and its
        # IndexError) looks unreachable -- confirm.
        if partition >= len(cum_lens):
            if loc > cum_lens[-1]:
                raise IndexError("index {0} is out of bounds".format(loc))
            else:
                index_within_partition = self._lengths[-1]
        else:
            # Global position of the first element of the chosen partition
            first_in_partition = \
                np.asscalar(np.concatenate(([0], cum_lens))[partition])
            index_within_partition = loc - first_in_partition
    # TODO: Stop-gap solution until we begin passing IndexMetadatas
    return partition, index_within_partition
    # Generate new index
    new_index = self.index.insert(loc, key)
    # Shift indices in partition where we inserted column
    # NOTE(review): the mask below only matches the entry AT the insert
    # position (==), not the entries after it -- confirm the intent before
    # re-enabling this code.
    idx_locs = (self._coord_df.partition == partition) & \
        (self._coord_df.index_within_partition ==
         index_within_partition)
    # TODO: Determine why self._coord_df{,_cache} are read-only
    _coord_df_copy = self._coord_df.copy()
    _coord_df_copy.loc[idx_locs, 'index_within_partition'] += 1
    # TODO: Determine if there's a better way to do a row-index insert in
    # pandas, because this is very annoying/unsure of efficiency
    # Create new coord entry to insert
    coord_to_insert = pandas.DataFrame(
        {'partition': partition,
         'index_within_partition': index_within_partition},
        index=[key])
    # Insert into cached RangeIndex, and order by new column index
    self._coord_df = _coord_df_copy.append(coord_to_insert).loc[new_index]
    # Return inserted coordinate for callee
    return coord_to_insert
def get_global_indices(self, partition, index_within_partition_list):
    """Translate positions inside one partition to global positions.

    Args:
        partition: Index of the partition the positions belong to.
        index_within_partition_list: Positions inside that partition.

    Returns:
        List with each position offset by the combined length of all
        partitions that come before `partition`.
    """
    offset = sum(self._lengths[i] for i in range(partition))
    return [offset + position for position in index_within_partition_list]
def squeeze(self, partition, index_within_partition):
    """Prepare a single coordinate for removal by "squeezing" the
    subsequent coordinates "up" one index within that partition.

    Every entry of `partition` whose `index_within_partition` is greater
    than the given one is decremented by one. To be used with
    "_IndexMetadata.drop" for when all the "squeezed" coordinates are
    dropped in batch.

    The coordinate frame is replaced by a modified copy; the frame that
    was stored before the call is left untouched.
    """
    self._coord_df = self._coord_df.copy()
    coords = self._coord_df
    after_removed = (coords.partition == partition) & \
        (coords.index_within_partition > index_within_partition)
    coords.loc[after_removed, 'index_within_partition'] -= 1
def copy(self):
    """Return a new _IndexMetadata built from copies of the cached state.

    Cached coordinate frame and lengths that are `ray.ObjectID`s are
    passed on as they are; any other value is duplicated with its own
    `.copy()`. The cached index is copied unless it is None.
    """
    # TODO: Investigate copy-on-write wrapper for metadata objects
    def duplicate(cached):
        if isinstance(cached, ray.ObjectID):
            return cached
        return cached.copy()

    cached_index = self._index_cache
    if cached_index is not None:
        cached_index = cached_index.copy()

    return _IndexMetadata(index=cached_index,
                          coord_df_oid=duplicate(self._coord_df_cache),
                          lengths_oid=duplicate(self._lengths_cache))
def __getitem__(self, key):
    """Return the coordinates (partition, index_within_partition) of `key`.

    Alias for `_IndexMetadata.coords_of` that exists so that slices can
    be passed, since slice notation is only available through
    `__getitem__` calls.

    Args:
        key: Item to get coordinates of. Can also be a tuple of item and
            {"partition", "index_within_partition"} if the caller only
            needs one of the coordinates.

    Returns:
        Pandas object with the keys specified. For a single object it is
        a pandas.Series with items `partition` and
        `index_within_partition`; for a slice or a duplicated key it is a
        pandas.DataFrame with said items as columns.
    """
    coordinates = self.coords_of(key)
    return coordinates
def first_valid_index(self):
    """Return `first_valid_index()` of the coordinate frame."""
    coords = self._coord_df
    return coords.first_valid_index()
def last_valid_index(self):
    """Return `last_valid_index()` of the coordinate frame."""
    coords = self._coord_df
    return coords.last_valid_index()
def drop(self, labels, errors='raise'):
    """Drop the specified labels from the IndexMetadata

    Args:
        labels (scalar or list-like):
            The labels to drop
        errors ('raise' or 'ignore'):
            Forwarded to `DataFrame.drop` on the coordinate frame.

    Returns:
        Pandas object with the coordinates of the dropped labels, as
        returned by `coords_of`.

    Raises:
        AssertionError: if `coords_of` returns neither a DataFrame nor a
            Series.
    """
    dropped = self.coords_of(labels)

    # Update first lengths to prevent possible length inconsistencies
    if isinstance(dropped, pandas.DataFrame):
        def count_per_partition(frame):
            # Number of dropped entries for every partition, 0 if none
            return frame.groupby("partition").size()\
                .reindex(index=pandas.RangeIndex(len(self._lengths)),
                         fill_value=0)

        try:
            drop_per_part = count_per_partition(dropped)
        except ValueError:
            # Copy the arrow sealed dataframe so we can mutate it.
            dropped = dropped.copy()
            drop_per_part = count_per_partition(dropped)
    elif isinstance(dropped, pandas.Series):
        # A single label: exactly one entry leaves its partition
        drop_per_part = np.zeros_like(self._lengths)
        drop_per_part[dropped["partition"]] = 1
    else:
        raise AssertionError("Unrecognized result from `coords_of`")

    self._lengths = self._lengths - np.array(drop_per_part)
    new_coord_df = self._coord_df.drop(labels, errors=errors)

    # Renumber partitions so that the ids of the remaining entries skip
    # over partitions that became empty.
    num_dropped = 0
    for i, length in enumerate(self._lengths):
        if length == 0:
            num_dropped += 1
        if num_dropped > 0:
            # Single .loc assignment: the chained form
            # `df['partition'][mask] -= n` may write to a temporary.
            new_coord_df.loc[new_coord_df['partition'] == i,
                             'partition'] -= num_dropped

    self._coord_df = new_coord_df
    return dropped
def rename_index(self, mapper):
    """Rename the index of the coordinate frame.

    Args:
        mapper: name to rename the index as; passed to
            `rename_axis(..., axis=0)`.
    """
    renamed = self._coord_df.rename_axis(mapper, axis=0)
    self._coord_df = renamed
def convert_to_index_sliceable(self, key):
    """Convert the passed slice and perform error checking on it.

    Delegates to the module level `convert_to_index_sliceable`, applied
    to the coordinate frame.

    Args:
        key: slice to convert and check
    """
    coords = self._coord_df
    return convert_to_index_sliceable(coords, key)
def get_partition(self, partition_id):
    """Return the rows of coord_df where partition == partition_id."""
    coords = self._coord_df
    belongs_to_partition = coords.partition == partition_id
    return coords[belongs_to_partition]
def sorted_index(self):
    """Return the index labels ordered by (partition,
    index_within_partition)."""
    by_position = self._coord_df.sort_values(
        ['partition', 'index_within_partition'])
    return by_position.index
-469
View File
@@ -1,469 +0,0 @@
"""Indexing helper classes work as follows:
_Location_Indexer_Base provides the method framework for __getitem__
and __setitem__ that works with Ray DataFrame's internal index. The base
class's __{get,set}item__ take partition & index-within-partition lookups
and perform the item lookup/write.
_Loc_Indexer and _iLoc_Indexer are responsible for indexer-specific logic and
lookup computation. _Loc_Indexer also takes care of enlarging the dataframe.
Both indexers translate pandas-style lookups into Ray DataFrame's internal
lookups.
An illustration is available at
https://github.com/ray-project/ray/pull/1955#issuecomment-386781826
"""
import pandas
import numpy as np
import ray
from warnings import warn
from pandas.api.types import (is_scalar, is_list_like, is_bool)
from pandas.core.dtypes.common import is_integer
from pandas.core.indexing import IndexingError
from .utils import (_blocks_to_col, _get_nan_block_id, extractor,
_mask_block_partitions, writer)
from .index_metadata import _IndexMetadata
from .dataframe import DataFrame
def is_slice(x):
    """Return True if `x` is a slice object."""
    return isinstance(x, slice)


def is_2d(x):
    """Return True if `x` is list-like or a slice."""
    return is_list_like(x) or is_slice(x)


def is_tuple(x):
    """Return True if `x` is a tuple."""
    return isinstance(x, tuple)


def is_boolean_array(x):
    """Return True if `x` is list-like and all of its items are bools."""
    return is_list_like(x) and all(map(is_bool, x))
def is_integer_slice(x):
    """Return True if `x` is a slice whose start, stop and step are each
    either None or an integer."""
    if not isinstance(x, slice):
        return False
    # Every position has to be None or an int
    return all(pos is None or is_integer(pos)
               for pos in (x.start, x.stop, x.step))
_ENLARGEMENT_WARNING = """
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
"""
_ILOC_INT_ONLY_ERROR = """
Location based indexing can only have [integer, integer slice (START point is
INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types.
"""
def _parse_tuple(tup):
    """Unpack the user input for getitem and setitem and compute ndim

    loc[a] -> ([a], :), 1D
    loc[[a,b],] -> ([a,b], :), 2D
    loc[a,b] -> ([a], [b]), 0D

    Returns:
        (row_loc, col_loc, ndim), where scalar locators are wrapped in a
        one element list and a missing locator is `slice(None)`.

    Raises:
        IndexingError: if a tuple with more than two items is passed.
    """
    if not is_tuple(tup):
        row_loc, col_loc = tup, slice(None)
    else:
        row_loc = tup[0]
        if len(tup) > 2:
            raise IndexingError('Too many indexers')
        col_loc = tup[1] if len(tup) == 2 else slice(None)

    # ndim has to be computed before scalars are wrapped into lists
    ndim = _compute_ndim(row_loc, col_loc)
    if is_scalar(row_loc):
        row_loc = [row_loc]
    if is_scalar(col_loc):
        col_loc = [col_loc]
    return row_loc, col_loc, ndim
def _is_enlargement(locator, coord_df):
    """Determine if a locator will enlarge the coord_df.

    Enlargement happens when the locator is a non-empty list-like of
    labels (not a slice and not a boolean array) and at least one of the
    labels isn't in the original index. In other words,
    enlargement == adding NaNs !
    """
    if not is_list_like(locator) or is_slice(locator):
        return False
    if not len(locator) > 0 or is_boolean_array(locator):
        return False
    missing_labels = pandas.Index(locator).difference(coord_df.index)
    return len(missing_labels) > 0
def _warn_enlargement():
    """Emit the FutureWarning about locating with missing labels."""
    notice = FutureWarning(_ENLARGEMENT_WARNING)
    warn(notice)
def _compute_ndim(row_loc, col_loc):
    """Compute the ndim of result from locators

    Every scalar locator removes one dimension: two scalars give 0, one
    scalar gives 1 and no scalar gives 2.
    """
    n_scalar_locators = int(is_scalar(row_loc)) + int(is_scalar(col_loc))
    return 2 - n_scalar_locators
class _Location_Indexer_Base():
    """Base class for location indexer like loc and iloc

    Subclasses translate the user's key into row/column lookup frames
    (slices of the coordinate frames) and then call `__getitem__` /
    `__setitem__` of this class to read or write the blocks.
    """

    def __init__(self, ray_df):
        self.df = ray_df
        self.col_coord_df = ray_df._col_metadata._coord_df
        self.row_coord_df = ray_df._row_metadata._coord_df
        self.is_view = isinstance(ray_df, DataFrameView)
        if self.is_view:
            # For a view, work on the unmasked blocks directly instead of
            # evaluating the masked `_block_partitions` first.
            self.block_oids = ray_df._block_partitions_data
        else:
            self.block_oids = ray_df._block_partitions

    def __getitem__(self, row_lookup, col_lookup, ndim):
        """
        Args:
            row_lookup: A pandas dataframe, a partial view from row_coord_df
            col_lookup: A pandas dataframe, a partial view from col_coord_df
            ndim: the dimension of returned data (0, 1 or 2)

        Returns:
            A DataFrameView for ndim 2, a pandas.Series for ndim 1 and
            the squeezed block content for ndim 0.

        Raises:
            ValueError: if `ndim` is not 0, 1 or 2.
        """
        if ndim == 2:
            return self._generate_view(row_lookup, col_lookup)
        if ndim not in (0, 1):
            # Fail before any remote work is submitted; this used to end
            # in an UnboundLocalError at the return statement.
            raise ValueError("ndim must be 0, 1 or 2, got {0}".format(ndim))

        extracted = self._retrive_items(row_lookup, col_lookup)
        if ndim == 0:
            return ray.get(extracted[0]).squeeze()

        result = ray.get(_blocks_to_col.remote(*extracted)).squeeze()
        if is_scalar(result):
            result = pandas.Series(result)
        # NOTE(review): when both lookups have exactly one entry, the row
        # lookup is used for the name as well as for the index -- confirm
        # that this is intended.
        scaler_axis = row_lookup if len(row_lookup) == 1 else col_lookup
        result.name = scaler_axis.iloc[0].name
        index_axis = row_lookup if len(col_lookup) == 1 else col_lookup
        result.index = index_axis.index
        return result

    def _retrive_items(self, row_lookup, col_lookup):
        """Given lookup dataframes, return a list of result oids

        One `extractor` task is submitted per (row partition, column
        partition) pair, in row-major order.
        """
        result_oids = []
        # We have to copy before we groupby because
        # https://github.com/pandas-dev/pandas/issues/10043
        row_groups = row_lookup.copy().groupby('partition')
        col_groups = col_lookup.copy().groupby('partition')
        for row_blk, row_data in row_groups:
            for col_blk, col_data in col_groups:
                block_oid = self.block_oids[row_blk, col_blk]
                row_idx = row_data['index_within_partition']
                col_idx = col_data['index_within_partition']
                result_oid = extractor.remote(block_oid, row_idx, col_idx)
                result_oids.append(result_oid)
        return result_oids

    def _generate_view(self, row_lookup, col_lookup):
        """Generate a DataFrameView from lookup
        """
        # Count how many of the selected labels fall into each partition
        row_lengths = [0] * len(self.df._row_metadata._lengths)
        for i in row_lookup["partition"]:
            row_lengths[i] += 1
        col_lengths = [0] * len(self.df._col_metadata._lengths)
        for i in col_lookup["partition"]:
            col_lengths[i] += 1

        row_metadata_view = _IndexMetadata(
            coord_df_oid=row_lookup, lengths_oid=row_lengths)
        col_metadata_view = _IndexMetadata(
            coord_df_oid=col_lookup, lengths_oid=col_lengths)
        df_view = DataFrameView(
            block_partitions=self.block_oids,
            row_metadata=row_metadata_view,
            col_metadata=col_metadata_view,
            index=row_metadata_view.index,
            columns=col_metadata_view.index)
        return df_view

    def __setitem__(self, row_lookup, col_lookup, item):
        """
        Args:
            row_lookup: A pandas dataframe, a partial view from row_coord_df
            col_lookup: A pandas dataframe, a partial view from col_coord_df
            item: The new item needs to be set. It can be any shape that's
                broadcastable to the product of the lookup tables.
        """
        to_shape = (len(row_lookup), len(col_lookup))
        item = self._broadcast_item(item, to_shape)
        self._write_items(row_lookup, col_lookup, item)

    def _broadcast_item(self, item, to_shape):
        """Use numpy to broadcast or reshape item.

        Items with as many elements as `to_shape` are reshaped, anything
        else is broadcast.

        Raises:
            ValueError: if `item` can neither be reshaped nor broadcast
                to `to_shape`.

        Notes:
            - Numpy is memory efficient, there shouldn't be performance
              issue.
        """
        item = np.array(item)
        try:
            if np.prod(to_shape) == np.prod(item.shape):
                return item.reshape(to_shape)
            return np.broadcast_to(item, to_shape)
        except ValueError:
            # The message is assembled from adjacent literals so that it
            # does not depend on how this source line is indented.
            raise ValueError(
                "could not broadcast input array from "
                "shape {from_shape} into shape {to_shape}".format(
                    from_shape=item.shape, to_shape=to_shape))

    def _write_items(self, row_lookup, col_lookup, item):
        """Perform remote write and replace blocks.

        `item` is expected to be 2D with shape
        (len(row_lookup), len(col_lookup)); it is cut into one tile per
        (row partition, column partition) pair.
        """
        # We have to copy before we groupby because
        # https://github.com/pandas-dev/pandas/issues/10043
        row_groups = row_lookup.copy().groupby('partition')
        col_groups = col_lookup.copy().groupby('partition')

        row_item_index = 0
        for row_blk, row_data in row_groups:
            row_len = len(row_data)

            col_item_index = 0
            for col_blk, col_data in col_groups:
                col_len = len(col_data)

                block_oid = self.block_oids[row_blk, col_blk]
                row_idx = row_data['index_within_partition']
                col_idx = col_data['index_within_partition']

                item_to_write = item[row_item_index:row_item_index + row_len,
                                     col_item_index:col_item_index + col_len]

                result_oid = writer.remote(block_oid, row_idx, col_idx,
                                           item_to_write)

                # Replace the written block in the underlying storage
                if self.is_view:
                    self.df._block_partitions_data[row_blk,
                                                   col_blk] = result_oid
                else:
                    self.df._block_partitions[row_blk, col_blk] = result_oid

                col_item_index += col_len
            row_item_index += row_len
class _Loc_Indexer(_Location_Indexer_Base):
    """A indexer for ray_df.loc[] functionality"""

    def __getitem__(self, key):
        """Label based read: parse `key`, enlarge if labels are missing,
        compute the lookups and delegate to the base class."""
        row_loc, col_loc, ndim = _parse_tuple(key)
        self._handle_enlargement(row_loc, col_loc)
        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        ndim = self._expand_dim(row_lookup, col_lookup, ndim)
        return super(_Loc_Indexer, self).__getitem__(row_lookup, col_lookup,
                                                     ndim)

    def __setitem__(self, key, item):
        """Label based write: parse `key`, enlarge if labels are missing,
        compute the lookups and delegate to the base class."""
        row_loc, col_loc, _ = _parse_tuple(key)
        self._handle_enlargement(row_loc, col_loc)
        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        super(_Loc_Indexer, self).__setitem__(row_lookup, col_lookup,
                                              item)

    def _handle_enlargement(self, row_loc, col_loc):
        """Handle Enlargement (if there is one).

        For each axis whose locator contains labels that are not in the
        index, the axis is enlarged, a FutureWarning is emitted and the
        coordinate frame of this indexer is replaced by the enlarged one.

        Returns:
            None
        """
        metadata = {'row': self.df._row_metadata,
                    'col': self.df._col_metadata}
        candidates = (('row', row_loc, self.row_coord_df),
                      ('col', col_loc, self.col_coord_df))

        for axis_name, locator, coord_df in candidates:
            if not _is_enlargement(locator, coord_df):
                continue
            metadata[axis_name] = self._enlarge_axis(locator, axis=axis_name)
            _warn_enlargement()

        self.row_coord_df = metadata['row']._coord_df
        self.col_coord_df = metadata['col']._coord_df

    def _enlarge_axis(self, locator, axis):
        """Add rows/columns to block partitions according to locator.

        Args:
            locator: list-like of labels, some of which are not in the
                index of the enlarged axis.
            axis: 'row' to add rows; any other value adds columns.

        Returns:
            metadata (_IndexMetadata)
        """
        # 1. Prepare variables
        enlarging_rows = axis == 'row'
        # major == the axis of the locator
        if enlarging_rows:
            major_meta = self.df._row_metadata
            minor_meta = self.df._col_metadata
        else:
            major_meta = self.df._col_metadata
            minor_meta = self.df._row_metadata

        # 2. Compute the nan labels and add blocks
        nan_labels = self._compute_enlarge_labels(locator, major_meta.index)
        num_nan_labels = len(nan_labels)
        # Shape before the NaN blocks are attached: the new blocks become
        # the partition right after the last existing one.
        n_row_parts, n_col_parts = self.block_oids.shape
        new_partition = n_row_parts if enlarging_rows else n_col_parts

        nan_blks = np.array([[
            _get_nan_block_id(
                num_nan_labels, n_cols, transpose=not enlarging_rows)
            for n_cols in minor_meta._lengths
        ]])
        if not enlarging_rows:
            nan_blks = nan_blks.T
        self.block_oids = np.concatenate(
            [self.block_oids, nan_blks], axis=0 if enlarging_rows else 1)

        # 3. Prepare metadata to return
        nan_coord_df = pandas.DataFrame(data=[{
            '': name,
            'partition': new_partition,
            'index_within_partition': i
        } for i, name in enumerate(nan_labels)]).set_index('')

        coord_df = pandas.concat([major_meta._coord_df, nan_coord_df])
        coord_df = coord_df.loc[locator]  # Re-index that allows duplicates

        lens = np.concatenate(
            [major_meta._lengths, np.array([num_nan_labels])])
        return _IndexMetadata(coord_df_oid=coord_df, lengths_oid=lens)

    def _compute_enlarge_labels(self, locator, base_index):
        """Helper for _enlarge_axis, compute common labels and extra labels.

        Returns:
            nan_labels: The labels needs to be added

        Raises:
            KeyError: if none of the labels in `locator` is in
                `base_index`.
        """
        # base_index_type can be pd.Index or pd.DatetimeIndex
        # depending on user input and pandas behavior
        # See issue #2264
        locator_as_index = type(base_index)(locator)

        nan_labels = locator_as_index.difference(base_index)
        common_labels = locator_as_index.intersection(base_index)

        if len(common_labels) == 0:
            raise KeyError(
                'None of [{labels}] are in the [{base_index_name}]'.format(
                    labels=list(locator_as_index), base_index_name=base_index))
        return nan_labels

    def _expand_dim(self, row_lookup, col_lookup, ndim):
        """Expand the dimension if necessary.

        This method is for cases like duplicate labels, where a scalar
        locator matches more than one entry.
        """
        n_rows, n_cols = len(row_lookup), len(col_lookup)

        if ndim == 0 and (n_rows > 1 or n_cols > 1):
            ndim = 1
        if ndim == 1 and n_rows > 1 and n_cols > 1:
            ndim = 2
        return ndim

    def _compute_lookup(self, row_loc, col_loc):
        """Return the (row, column) coordinate frames selected by label."""
        return self.row_coord_df.loc[row_loc], self.col_coord_df.loc[col_loc]
class _iLoc_Indexer(_Location_Indexer_Base):
    """A indexer for ray_df.iloc[] functionality"""

    def __getitem__(self, key):
        """Position based read: parse and validate `key`, compute the
        lookups and delegate to the base class."""
        row_loc, col_loc, ndim = _parse_tuple(key)
        for locator in (row_loc, col_loc):
            self._check_dtypes(locator)

        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        return super(_iLoc_Indexer, self).__getitem__(
            row_lookup, col_lookup, ndim)

    def __setitem__(self, key, item):
        """Position based write: parse and validate `key`, compute the
        lookups and delegate to the base class."""
        row_loc, col_loc, _ = _parse_tuple(key)
        for locator in (row_loc, col_loc):
            self._check_dtypes(locator)

        row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
        super(_iLoc_Indexer, self).__setitem__(
            row_lookup, col_lookup, item)

    def _compute_lookup(self, row_loc, col_loc):
        """Return the (row, column) coordinate frames selected by
        position."""
        row_lookup = self.row_coord_df.iloc[row_loc]
        col_lookup = self.col_coord_df.iloc[col_loc]
        return row_lookup, col_lookup

    def _check_dtypes(self, locator):
        """Raise ValueError unless `locator` is an integer, an integer
        slice, a list-like of integers or a boolean array."""
        accepted_forms = [
            is_integer(locator),
            is_integer_slice(locator),
            is_list_like(locator) and all(map(is_integer, locator)),
            is_boolean_array(locator),
        ]
        if not any(accepted_forms):
            raise ValueError(_ILOC_INT_ONLY_ERROR)
class DataFrameView(DataFrame):
    """A subclass of DataFrame where the index can be smaller than blocks.

    The full blocks are kept in `_block_partitions_data`; reading
    `_block_partitions` returns them masked by the row and column
    metadata of the view.
    """

    def __init__(self, block_partitions, row_metadata, col_metadata, index,
                 columns):
        # Goes through the property setter below and therefore ends up in
        # `_block_partitions_data`.
        self._block_partitions = block_partitions
        self._row_metadata = row_metadata
        self._col_metadata = col_metadata
        self.index = index
        self.columns = columns

    def _get_block_partitions(self):
        """Return the stored blocks masked by the view's metadata."""
        return _mask_block_partitions(self._block_partitions_data,
                                      self._row_metadata,
                                      self._col_metadata)

    def _set_block_partitions(self, new_block_partitions):
        """Store the unmasked blocks."""
        self._block_partitions_data = new_block_partitions

    _block_partitions = property(_get_block_partitions, _set_block_partitions)
-524
View File
@@ -1,524 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from itertools import chain
from io import BytesIO
import os
import re
import warnings
import pandas
from pyarrow.parquet import ParquetFile
from pandas.io.common import _infer_compression # don't depend on internal API
from .dataframe import ray, DataFrame
from . import get_npartitions
from .utils import from_pandas
# Parquet
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.
    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Ray only support pyarrow reader.
                This argument doesn't do anything for now.
        columns: Columns to read. If None, all columns of the file except
                 the pandas index columns (`__index_level_N__`) are read.
        kwargs: Pass into parquet's read_row_group function.
    """
    pf = ParquetFile(path)

    n_rows = pf.metadata.num_rows
    # At least one row per chunk: with fewer rows than partitions the
    # integer division gives 0, which _split_df cannot make progress on.
    chunksize = max(1, n_rows // get_npartitions())
    n_row_groups = pf.metadata.num_row_groups

    if columns is None:
        idx_regex = re.compile(r'__index_level_\d+__')
        columns = [
            name for name in pf.metadata.schema.names
            if not idx_regex.match(name)
        ]
    else:
        # Honor the caller's selection instead of overwriting it
        columns = list(columns)

    df_from_row_groups = [
        _read_parquet_row_group.remote(path, columns, i, kwargs)
        for i in range(n_row_groups)
    ]
    splited_dfs = ray.get(
        [_split_df.remote(df, chunksize) for df in df_from_row_groups])
    df_remotes = list(chain.from_iterable(splited_dfs))

    return DataFrame(row_partitions=df_remotes, columns=columns)
@ray.remote
def _read_parquet_row_group(path, columns, row_group_id, kwargs=None):
    """Read a parquet row_group given file_path.

    Args:
        path: The filepath of the parquet file.
        columns: Columns to read from the row group.
        row_group_id: Index of the row group to read.
        kwargs: Optional dict of keyword arguments for `read_row_group`.

    Returns:
        The row group converted to a pandas DataFrame.
    """
    # None instead of a shared mutable `{}` default
    if kwargs is None:
        kwargs = {}
    pf = ParquetFile(path)
    df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas()
    return df
@ray.remote
def _split_df(pandas_df, chunksize):
    """Split a pandas_df into partitions.

    Every partition has at most `chunksize` rows and a fresh index
    starting at 0. A `chunksize` below 1 is treated as 1, since the
    splitting loop could not make progress otherwise.

    Returns:
        remote_df_ids ([ObjectID])
    """
    chunksize = max(1, chunksize)
    dataframes = []

    while len(pandas_df) > chunksize:
        # reset_index returns a new frame; keep its result so that every
        # chunk (not only the last one) starts at index 0.
        t_df = pandas_df[:chunksize].reset_index(drop=True)
        dataframes.append(ray.put(t_df))
        pandas_df = pandas_df[chunksize:]

    # Whatever is left (possibly an empty frame) is the last partition
    dataframes.append(ray.put(pandas_df.reset_index(drop=True)))
    return dataframes
# CSV
def _compute_offset(fn, npartitions, ignore_first_line=False):
    """
    Calculate the correct bytes offsets for a csv file.

    Args:
        fn: Path of the csv file.
        npartitions: Number of chunks to aim for.
        ignore_first_line: If True, the first line is skipped and not
            part of any chunk.

    Returns:
        A list of (start, end) tuples where end is the position of a \n
        or lies at/after EOF.
    """
    total_bytes = os.path.getsize(fn)

    # The context manager closes the file even if reading fails
    with open(fn, 'rb') as bio:
        if ignore_first_line:
            start = len(bio.readline())
            chunksize = (total_bytes - start) // npartitions
        else:
            start = 0
            chunksize = total_bytes // npartitions
        if chunksize == 0:
            chunksize = 1

        offsets = []
        while start < total_bytes:
            bio.seek(chunksize, 1)  # Move forward {chunksize} bytes
            extend_line = bio.readline()  # Move after the next \n
            total_offset = chunksize + len(extend_line)
            # The position of the \n we just crossed.
            new_line_cursor = start + total_offset - 1
            offsets.append((start, new_line_cursor))
            start = new_line_cursor + 1

    return offsets
def _get_firstline(file_path):
    """Return the first line of `file_path` as bytes, including its line
    terminator if there is one."""
    # The context manager closes the file even if reading fails
    with open(file_path, 'rb') as bio:
        return bio.readline()
def _infer_column(first_line, kwargs=None):
    """Return the columns pandas infers from the first line of a csv file.

    Args:
        first_line: The first line of the file, as bytes.
        kwargs: Optional dict of keyword arguments for `pandas.read_csv`.
    """
    # None instead of a shared mutable `{}` default
    if kwargs is None:
        kwargs = {}
    return pandas.read_csv(BytesIO(first_line), **kwargs).columns
@ray.remote
def _read_csv_with_offset(fn, start, end, kwargs=None, header=b''):
    """Parse the bytes [start, end) of a csv file with pandas.

    Args:
        fn: Path of the csv file.
        start: Offset of the first byte to read.
        end: Offset one past the last byte to read.
        kwargs: Optional dict of keyword arguments for `pandas.read_csv`.
        header: Bytes to prepend to the chunk before parsing.

    Returns:
        (pandas_df, index): the parsed chunk with a RangeIndex, and the
        index pandas originally gave it.
    """
    # Work on a private copy so that neither the caller's dict nor a
    # shared default is modified.
    kwargs = dict(kwargs) if kwargs else {}
    if "quoting" in kwargs:
        kwargs["quoting"] = int(kwargs["quoting"])  # See issue #2078

    with open(fn, 'rb') as bio:
        bio.seek(start)
        to_read = header + bio.read(end - start)

    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    index = pandas_df.index
    # Partitions must have RangeIndex
    pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
    return pandas_df, index
@ray.remote
def get_index(*partition_indices):
    """Concatenate the indices of all partitions, in the order given."""
    first, rest = partition_indices[0], partition_indices[1:]
    return first.append(rest)
def read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer',
             names=None, index_col=None, usecols=None, squeeze=False,
             prefix=None, mangle_dupe_cols=True, dtype=None, engine=None,
             converters=None, true_values=None, false_values=None,
             skipinitialspace=False, skiprows=None, nrows=None,
             na_values=None, keep_default_na=True, na_filter=True,
             verbose=False, skip_blank_lines=True, parse_dates=False,
             infer_datetime_format=False, keep_date_col=False,
             date_parser=None, dayfirst=False, iterator=False,
             chunksize=None, compression='infer', thousands=None,
             decimal=b'.', lineterminator=None, quotechar='"', quoting=0,
             escapechar=None, comment=None, encoding=None, dialect=None,
             tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True,
             skipfooter=0, skip_footer=0, doublequote=True,
             delim_whitespace=False, as_recarray=None, compact_ints=None,
             use_unsigned=None, low_memory=True, buffer_lines=None,
             memory_map=False, float_precision=None):
    """Read csv file from local disk.

    Uncompressed files given as a `str` path are split into byte ranges
    that are parsed by remote tasks. Everything else is read with
    `pandas.read_csv` and converted afterwards.

    Args:
        filepath_or_buffer:
              The filepath of the csv file.
              We only support local files for now.
        kwargs: Keyword arguments in pandas::from_csv
    """
    # Snapshot the arguments before any other local name exists: at this
    # point locals() holds exactly the parameters of this function.
    kwargs = dict(locals())
    del kwargs['filepath_or_buffer']

    # Default to Pandas read_csv for non-serializable objects
    parallel_readable = isinstance(filepath_or_buffer, str) and \
        _infer_compression(filepath_or_buffer, compression) is None
    if not parallel_readable:
        warnings.warn("Defaulting to Pandas implementation",
                      PendingDeprecationWarning)

        pandas_obj = pandas.read_csv(filepath_or_buffer, **kwargs)
        if isinstance(pandas_obj, pandas.DataFrame):
            return from_pandas(pandas_obj, get_npartitions())
        return pandas_obj

    filepath = filepath_or_buffer

    # TODO: handle case where header is a list of lines
    first_line = _get_firstline(filepath)
    columns = _infer_column(first_line, kwargs=kwargs)

    # Without a header line in the file there is nothing to skip and
    # nothing to prepend to the later chunks.
    ignore_first_line = not (
        header is None or (header == "infer" and names is not None))
    if not ignore_first_line:
        first_line = b""

    offsets = _compute_offset(filepath, get_npartitions(),
                              ignore_first_line=ignore_first_line)

    # Serialize objects to speed up later use in remote tasks
    first_line_id = ray.put(first_line)
    kwargs_id = ray.put(kwargs)

    df_obj_ids = []
    index_obj_ids = []
    for start, end in offsets:
        task_args = (filepath, start, end, kwargs_id)
        if start != 0:
            # Chunks that do not start at the top of the file get the
            # first line prepended.
            task_args += (first_line_id,)
        df, index = _read_csv_with_offset._submit(
            args=task_args,
            num_return_vals=2)
        df_obj_ids.append(df)
        index_obj_ids.append(index)

    if index_col is not None:
        index = get_index.remote(*index_obj_ids)
    else:
        index = None

    return DataFrame(row_partitions=df_obj_ids, columns=columns, index=index)
def read_json(path_or_buf=None,
              orient=None,
              typ='frame',
              dtype=True,
              convert_axes=True,
              convert_dates=True,
              keep_default_dates=True,
              numpy=False,
              precise_float=False,
              date_unit=None,
              encoding=None,
              lines=False,
              chunksize=None,
              compression='infer'):
    """Read JSON with `pandas.read_json` and convert the result with
    `from_pandas`. Emits a PendingDeprecationWarning about the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_json(
        path_or_buf, orient=orient, typ=typ, dtype=dtype,
        convert_axes=convert_axes, convert_dates=convert_dates,
        keep_default_dates=keep_default_dates, numpy=numpy,
        precise_float=precise_float, date_unit=date_unit,
        encoding=encoding, lines=lines, chunksize=chunksize,
        compression=compression)
    return from_pandas(port_frame, get_npartitions())
def read_html(io,
              match='.+',
              flavor=None,
              header=None,
              index_col=None,
              skiprows=None,
              attrs=None,
              parse_dates=False,
              tupleize_cols=None,
              thousands=',',
              encoding=None,
              decimal='.',
              converters=None,
              na_values=None,
              keep_default_na=True):
    """Read HTML tables with `pandas.read_html` and convert the first
    table of the result with `from_pandas`. Emits a
    PendingDeprecationWarning about the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    tables = pandas.read_html(
        io, match=match, flavor=flavor, header=header, index_col=index_col,
        skiprows=skiprows, attrs=attrs, parse_dates=parse_dates,
        tupleize_cols=tupleize_cols, thousands=thousands, encoding=encoding,
        decimal=decimal, converters=converters, na_values=na_values,
        keep_default_na=keep_default_na)
    # Only the first table is converted
    return from_pandas(tables[0], get_npartitions())
def read_clipboard(sep=r'\s+'):
    """Read the clipboard with `pandas.read_clipboard` and convert the
    result with `from_pandas`. Emits a PendingDeprecationWarning about
    the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_clipboard(sep=sep)
    return from_pandas(port_frame, get_npartitions())
def read_excel(io,
               sheet_name=0,
               header=0,
               skiprows=None,
               skip_footer=0,
               index_col=None,
               names=None,
               usecols=None,
               parse_dates=False,
               date_parser=None,
               na_values=None,
               thousands=None,
               convert_float=True,
               converters=None,
               dtype=None,
               true_values=None,
               false_values=None,
               engine=None,
               squeeze=False):
    """Read an Excel sheet with `pandas.read_excel` and convert the result
    with `from_pandas`. Emits a PendingDeprecationWarning about the
    fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_excel(
        io, sheet_name=sheet_name, header=header, skiprows=skiprows,
        skip_footer=skip_footer, index_col=index_col, names=names,
        usecols=usecols, parse_dates=parse_dates, date_parser=date_parser,
        na_values=na_values, thousands=thousands,
        convert_float=convert_float, converters=converters, dtype=dtype,
        true_values=true_values, false_values=false_values, engine=engine,
        squeeze=squeeze)
    return from_pandas(port_frame, get_npartitions())
def read_hdf(path_or_buf,
             key=None,
             mode='r'):
    """Read HDF with `pandas.read_hdf` and convert the result with
    `from_pandas`. Emits a PendingDeprecationWarning about the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_hdf(path_or_buf, key=key, mode=mode)
    return from_pandas(port_frame, get_npartitions())
def read_feather(path,
                 nthreads=1):
    """Read a feather file with `pandas.read_feather` and convert the
    result with `from_pandas`.

    Args:
        path: Path of the feather file.
        nthreads: Number of threads for reading, forwarded to pandas.

    Emits a PendingDeprecationWarning about the fallback.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward nthreads; it used to be accepted but silently ignored
    port_frame = pandas.read_feather(path, nthreads=nthreads)
    ray_frame = from_pandas(port_frame, get_npartitions())

    return ray_frame
def read_msgpack(path_or_buf,
                 encoding='utf-8',
                 iterator=False):
    """Read msgpack with `pandas.read_msgpack` and convert the result with
    `from_pandas`. Emits a PendingDeprecationWarning about the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_msgpack(path_or_buf, encoding=encoding,
                                     iterator=iterator)
    return from_pandas(port_frame, get_npartitions())
def read_stata(filepath_or_buffer,
               convert_dates=True,
               convert_categoricals=True,
               encoding=None,
               index_col=None,
               convert_missing=False,
               preserve_dtypes=True,
               columns=None,
               order_categoricals=True,
               chunksize=None,
               iterator=False):
    """Read a Stata file with `pandas.read_stata` and convert the result
    with `from_pandas`. Emits a PendingDeprecationWarning about the
    fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_stata(
        filepath_or_buffer, convert_dates=convert_dates,
        convert_categoricals=convert_categoricals, encoding=encoding,
        index_col=index_col, convert_missing=convert_missing,
        preserve_dtypes=preserve_dtypes, columns=columns,
        order_categoricals=order_categoricals, chunksize=chunksize,
        iterator=iterator)
    return from_pandas(port_frame, get_npartitions())
def read_sas(filepath_or_buffer,
             format=None,
             index=None,
             encoding=None,
             chunksize=None,
             iterator=False):
    """Read a SAS file with `pandas.read_sas` and convert the result with
    `from_pandas`. Emits a PendingDeprecationWarning about the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_sas(filepath_or_buffer, format=format,
                                 index=index, encoding=encoding,
                                 chunksize=chunksize, iterator=iterator)
    return from_pandas(port_frame, get_npartitions())
def read_pickle(path,
                compression='infer'):
    """Read a pickle with `pandas.read_pickle` and convert the result with
    `from_pandas`. Emits a PendingDeprecationWarning about the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_pickle(path, compression=compression)
    return from_pandas(port_frame, get_npartitions())
def read_sql(sql,
             con,
             index_col=None,
             coerce_float=True,
             params=None,
             parse_dates=None,
             columns=None,
             chunksize=None):
    """Run a query with `pandas.read_sql` and convert the result with
    `from_pandas`. Emits a PendingDeprecationWarning about the fallback."""
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pandas.read_sql(sql, con, index_col=index_col,
                                 coerce_float=coerce_float, params=params,
                                 parse_dates=parse_dates, columns=columns,
                                 chunksize=chunksize)
    return from_pandas(port_frame, get_npartitions())
-29
View File
@@ -1,29 +0,0 @@
from collections import Iterator
class PartitionIterator(Iterator):
    """Iterator that chains the inner iterators of several partitions."""

    def __init__(self, partitions, func):
        """PartitionIterator class to define a generator on partitioned data

        Args:
            partitions ([ObjectID]): Partitions to iterate over
            func (callable): The function to get inner iterables from
                each partition
        """
        self.partitions = iter(partitions)
        self.func = func
        # Start with an exhausted iterator so that the first call to
        # next() moves on to the first partition.
        self.iter_cache = iter([])

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next item, moving on to the next partition whenever
        the current one is exhausted.

        Raises:
            StopIteration: when all partitions are exhausted.
        """
        # A loop instead of recursion: a long run of empty partitions
        # must not exhaust the recursion limit.
        while True:
            try:
                return next(self.iter_cache)
            except StopIteration:
                # Raises StopIteration itself once no partition is left
                next_partition = next(self.partitions)
                self.iter_cache = self.func(next_partition)
-94
View File
@@ -1,94 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import inspect
def code_gen(pandas_obj, ray_obj, path):
    """Generate code skeleton for methods not in Ray

    For every name reported by ``pandas_ray_diff`` a stub is written to
    ``path`` (the file is overwritten). Callables get a ``def`` carrying the
    signature reported by ``inspect.signature``; attributes for which
    ``inspect.signature`` raises ``TypeError`` are emitted as properties; and
    names for which it raises ``ValueError`` are skipped.

    Args:
        pandas_obj: The pandas object to generate code from.
        ray_obj: The ray object to diff against.
        path: Path to output the file to.
    """
    with open(path, "w") as outfile:
        funcs = pandas_ray_diff(pandas_obj, ray_obj)

        for func in funcs:
            # Skip single-underscore (private) names but keep dunders.
            # startswith() is used instead of indexing so a one-character
            # name such as "_" cannot raise IndexError.
            if func.startswith("_") and not func.startswith("__"):
                continue
            if "attr" in func:
                # let's not mess with these
                continue
            try:
                outfile.write(
                    "\ndef " + func +
                    str(inspect.signature(getattr(pandas_obj, func))) + ":\n")
            except TypeError:
                # Not something inspect can take a signature of: emit it as
                # a property stub instead.
                outfile.write("\n@property")
                outfile.write("\ndef " + func + "(self):\n")
            except ValueError:
                # No signature could be provided for this object; nothing
                # sensible can be generated for it.
                continue
            # Stub body, indented four spaces so the generated file follows
            # standard Python layout.
            outfile.write(
                "    raise NotImplementedError(\"Not Yet implemented.\")\n")
def code_gen_test(ray_obj, path, name):
    """Generate tests for methods in Ray.

    For every non-private name in ``dir(ray_obj)`` a ``test_<name>``
    function is appended to ``path``. Each generated test builds an object
    with ``create_test_<name>()`` and accesses or calls the attribute inside
    ``pytest.raises(NotImplementedError)``, passing ``None`` for each
    argument.

    Args:
        ray_obj: The Ray object whose attributes get a generated test.
        path: Path of the file to append the generated tests to.
        name: Suffix used for the generated ``ray_<name>`` variable and the
            ``create_test_<name>`` factory call.
    """
    with open(path, "a") as outfile:
        funcs = dir(ray_obj)

        for func in funcs:
            # Skip single-underscore (private) names but keep dunders.
            # startswith() is used instead of indexing so a one-character
            # name such as "_" cannot raise IndexError.
            if func.startswith("_") and not func.startswith("__"):
                continue

            outfile.write("\n\ndef test_" + func + "():\n")
            # The function body is indented four spaces and the body of the
            # `with` block eight, so the generated file is valid Python.
            outfile.write(
                "    ray_" + name + " = create_test_" + name + "()\n\n" +
                "    with pytest.raises(NotImplementedError):\n" +
                "        ray_" + name + "." + func)

            try:
                param_num = \
                    len(inspect.signature(getattr(ray_obj, func)).parameters)
            except (TypeError, ValueError, NotImplementedError):
                # No usable signature (or the attribute access itself is
                # unimplemented): the generated test only accesses the
                # attribute without calling it.
                outfile.write("\n")
                continue

            # One parameter is dropped when there is more than one.
            # NOTE(review): this looks like it is meant to discount `self`;
            # a callable whose only parameter is `self` still gets a single
            # None argument -- confirm that is intended.
            if param_num > 1:
                param_num -= 1

            outfile.write("(" + ", ".join(["None"] * param_num) + ")\n")
def pandas_ray_diff(pandas_obj, ray_obj):
    """Gets the diff of the methods in the Pandas and Ray objects.

    Names with a single leading underscore are treated as private and
    ignored; double-underscore names are kept.

    Args:
        pandas_obj: The Pandas object to diff.
        ray_obj: The Ray object to diff.

    Returns:
        A sorted list of the attribute names found on ``pandas_obj`` but not
        on ``ray_obj``.
    """
    # Build the lookup set once instead of once per candidate name.
    ray_funcs = set(dir(ray_obj))

    # startswith() rather than indexing: a one-character name such as "_"
    # must not raise IndexError. Sorting makes the result deterministic
    # across runs.
    return sorted(
        func for func in set(dir(pandas_obj))
        if (not func.startswith("_") or func.startswith("__"))
        and func not in ray_funcs)
-125
View File
@@ -1,125 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ray
import pandas
import numpy as np
from pandas import compat
from pandas.core.dtypes.common import is_list_like
from itertools import cycle
from .dataframe import DataFrame
from .utils import _deploy_func
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False):
    """Convert categorical variable into indicator variables.

    Inputs that are not this package's DataFrame are passed straight to
    ``pandas.get_dummies``. For a DataFrame, each column partition is
    encoded separately through ``_deploy_func`` and the results are
    assembled into a new DataFrame whose columns are the untouched columns
    followed by the encoded ones.

    Args:
        data (array-like, Series, or DataFrame): data to encode.
        prefix (string, [string]): Prefix to apply to each encoded column
            label.
        prefix_sep (string, [string]): Separator between prefix and value.
        dummy_na (bool): Add a column to indicate NaNs.
        columns: Which columns to encode.
        sparse (bool): Not Implemented: If True, returns SparseDataFrame.
        drop_first (bool): Whether to remove the first level of encoded data.

    Returns:
        DataFrame or one-hot encoded data.

    Raises:
        NotImplementedError: If ``sparse`` is true for a DataFrame input.
        ValueError: If a list-like ``prefix`` or ``prefix_sep`` does not have
            one entry per encoded column.
    """
    if not isinstance(data, DataFrame):
        return pandas.get_dummies(data, prefix=prefix, prefix_sep=prefix_sep,
                                  dummy_na=dummy_na, columns=columns,
                                  sparse=sparse, drop_first=drop_first)

    if sparse:
        raise NotImplementedError(
            "SparseDataFrame is not implemented. "
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    if columns is None:
        # Default: encode every object- or category-typed column.
        columns_to_encode = data.dtypes.isin([np.dtype("O"), 'category'])
        columns_to_encode = data.columns[columns_to_encode]
    else:
        columns_to_encode = columns

    def check_len(item, name):
        # A list-like prefix/separator must supply one entry per encoded
        # column; scalars and None are accepted as-is.
        len_msg = ("Length of '{name}' ({len_item}) did not match the "
                   "length of the columns being encoded ({len_enc}).")

        if is_list_like(item):
            if not len(item) == len(columns_to_encode):
                len_msg = len_msg.format(name=name, len_item=len(item),
                                         len_enc=len(columns_to_encode))
                raise ValueError(len_msg)

    check_len(prefix, 'prefix')
    check_len(prefix_sep, 'prefix_sep')

    # Normalise `prefix` to one entry per encoded column.
    if isinstance(prefix, compat.string_types):
        prefix = cycle([prefix])
        prefix = [next(prefix) for _ in range(len(columns_to_encode))]
    if isinstance(prefix, dict):
        prefix = [prefix[col] for col in columns_to_encode]
    if prefix is None:
        prefix = columns_to_encode

    # validate separators
    if isinstance(prefix_sep, compat.string_types):
        prefix_sep = cycle([prefix_sep])
        prefix_sep = [next(prefix_sep) for _ in range(len(columns_to_encode))]
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[col] for col in columns_to_encode]

    if set(columns_to_encode) == set(data.columns):
        # Every column is encoded, so nothing is carried over untouched.
        with_dummies = []
        # Build the empty index from an empty list: a bare pandas.Index()
        # passes no data, which the pandas release this package pins
        # (0.22) rejects with a TypeError.
        dropped_columns = pandas.Index([])
    else:
        with_dummies = data.drop(columns_to_encode, axis=1)._col_partitions
        dropped_columns = data.columns.drop(columns_to_encode)

    def get_dummies_remote(df, to_drop, prefix, prefix_sep):
        # Encode one column partition. Returns the encoded frame, relabelled
        # with a positional RangeIndex, together with its real column labels.
        df = df.drop(to_drop, axis=1)

        if df.size == 0:
            return df, df.columns

        df = pandas.get_dummies(df, prefix=prefix, prefix_sep=prefix_sep,
                                dummy_na=dummy_na, columns=None, sparse=sparse,
                                drop_first=drop_first)
        columns = df.columns
        df.columns = pandas.RangeIndex(0, len(df.columns))
        return df, columns

    # `total` counts the encoded columns handled so far; it selects the
    # slice of prefix/prefix_sep that belongs to the current partition.
    total = 0
    columns = []
    for i, part in enumerate(data._col_partitions):
        col_index = data._col_metadata.partition_series(i)

        # TODO(kunalgosar): Handle the case of duplicate columns here
        to_encode = col_index.index.isin(columns_to_encode)

        to_encode = col_index[to_encode]
        to_drop = col_index.drop(to_encode.index)

        # NOTE(review): presumably submits get_dummies_remote as a remote
        # task returning two object IDs -- confirm against _deploy_func.
        result = _deploy_func._submit(
            args=(get_dummies_remote, part, to_drop,
                  prefix[total:total + len(to_encode)],
                  prefix_sep[total:total + len(to_encode)]),
            num_return_vals=2)

        with_dummies.append(result[0])
        columns.append(result[1])
        total += len(to_encode)

    columns = ray.get(columns)
    dropped_columns = dropped_columns.append(columns)

    return DataFrame(col_partitions=with_dummies,
                     columns=dropped_columns,
                     index=data.index)
-970
View File
@@ -1,970 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas
from .utils import _inherit_docstrings
def na_op():
    """Placeholder for the NA-handling helper that pandas has an analogue of.

    Raises:
        NotImplementedError: Always; no implementation exists yet.
    """
    message = "Not Yet implemented."
    raise NotImplementedError(message)
@_inherit_docstrings(pandas.Series, excluded=[pandas.Series,
pandas.Series.__init__])
class Series(object):
def __init__(self, series_oids):
"""Constructor for a Series object.
Args:
series_oids ([ObjectID]): The list of remote Series objects.
"""
self.series_oids = series_oids
@property
def T(self):
raise NotImplementedError("Not Yet implemented.")
def __abs__(self):
raise NotImplementedError("Not Yet implemented.")
def __add__(self, right, name='__add__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __and__(self, other):
raise NotImplementedError("Not Yet implemented.")
def __array__(self, result=None):
raise NotImplementedError("Not Yet implemented.")
def __array_prepare__(self, result, context=None):
raise NotImplementedError("Not Yet implemented.")
@property
def __array_priority__(self):
raise NotImplementedError("Not Yet implemented.")
def __array_wrap__(self, result, context=None):
raise NotImplementedError("Not Yet implemented.")
def __bool__(self):
raise NotImplementedError("Not Yet implemented.")
def __bytes__(self):
raise NotImplementedError("Not Yet implemented.")
def __class__(self, data=None, index=None, dtype=None, name=None,
copy=False, fastpath=False):
raise NotImplementedError("Not Yet implemented.")
def __contains__(self, key):
raise NotImplementedError("Not Yet implemented.")
def __copy__(self, deep=True):
raise NotImplementedError("Not Yet implemented.")
def __deepcopy__(self, memo=None):
raise NotImplementedError("Not Yet implemented.")
def __delitem__(self, key):
raise NotImplementedError("Not Yet implemented.")
def __dir__(self):
return list(type(self).__dict__.keys())
def __div__(self, right, name='__truediv__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __divmod__(self, right, name='__divmod__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
@property
def __doc__(self):
raise NotImplementedError("Not Yet implemented.")
def __eq__(self, other, axis=None):
raise NotImplementedError("Not Yet implemented.")
def __finalize__(self, other, method=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def __float__(self):
raise NotImplementedError("Not Yet implemented.")
def __floordiv__(self, right, name='__floordiv__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __ge__(self, other, axis=None):
raise NotImplementedError("Not Yet implemented.")
def __getitem__(self, key):
raise NotImplementedError("Not Yet implemented.")
def __getstate__(self):
raise NotImplementedError("Not Yet implemented.")
def __gt__(self, other, axis=None):
raise NotImplementedError("Not Yet implemented.")
def __iadd__(self, other):
raise NotImplementedError("Not Yet implemented.")
def __imul__(self, other):
raise NotImplementedError("Not Yet implemented.")
def __int__(self):
raise NotImplementedError("Not Yet implemented.")
def __invert__(self):
raise NotImplementedError("Not Yet implemented.")
def __ipow__(self, other):
raise NotImplementedError("Not Yet implemented.")
def __isub__(self, other):
raise NotImplementedError("Not Yet implemented.")
def __iter__(self):
raise NotImplementedError("Not Yet implemented.")
def __itruediv__(self, other):
raise NotImplementedError("Not Yet implemented.")
def __le__(self, other, axis=None):
raise NotImplementedError("Not Yet implemented.")
def __len__(self):
raise NotImplementedError("Not Yet implemented.")
def __long__(self):
raise NotImplementedError("Not Yet implemented.")
def __lt__(self, other, axis=None):
raise NotImplementedError("Not Yet implemented.")
def __mod__(self, right, name='__mod__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __mul__(self, right, name='__mul__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __ne__(self, other, axis=None):
raise NotImplementedError("Not Yet implemented.")
def __neg__(self):
raise NotImplementedError("Not Yet implemented.")
def __nonzero__(self):
raise NotImplementedError("Not Yet implemented.")
def __or__(self, other):
raise NotImplementedError("Not Yet implemented.")
def __pow__(self, right, name='__pow__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __repr__(self):
raise NotImplementedError("Not Yet implemented.")
def __round__(self, decimals=0):
raise NotImplementedError("Not Yet implemented.")
def __setitem__(self, key, value):
raise NotImplementedError("Not Yet implemented.")
def __setstate__(self, state):
raise NotImplementedError("Not Yet implemented.")
def __sizeof__(self):
raise NotImplementedError("Not Yet implemented.")
def __str__(self):
raise NotImplementedError("Not Yet implemented.")
def __sub__(self, right, name='__sub__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __truediv__(self, right, name='__truediv__', na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __xor__(self, other):
raise NotImplementedError("Not Yet implemented.")
def abs(self):
raise NotImplementedError("Not Yet implemented.")
def add(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def add_prefix(self, prefix):
raise NotImplementedError("Not Yet implemented.")
def add_suffix(self, suffix):
raise NotImplementedError("Not Yet implemented.")
def agg(self, func, axis=0, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def aggregate(self, func, axis=0, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def align(self, other, join='outer', axis=None, level=None, copy=True,
fill_value=None, method=None, limit=None, fill_axis=0,
broadcast_axis=None):
raise NotImplementedError("Not Yet implemented.")
def all(self, axis=None, bool_only=None, skipna=None, level=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def any(self, axis=None, bool_only=None, skipna=None, level=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def append(self, to_append, ignore_index=False, verify_integrity=False):
raise NotImplementedError("Not Yet implemented.")
def apply(self, func, convert_dtype=True, args=(), **kwds):
raise NotImplementedError("Not Yet implemented.")
def argmax(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def argmin(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def argsort(self, axis=0, kind='quicksort', order=None):
raise NotImplementedError("Not Yet implemented.")
def as_blocks(self, copy=True):
raise NotImplementedError("Not Yet implemented.")
def as_matrix(self, columns=None):
raise NotImplementedError("Not Yet implemented.")
def asfreq(self, freq, method=None, how=None, normalize=False,
fill_value=None):
raise NotImplementedError("Not Yet implemented.")
def asof(self, where, subset=None):
raise NotImplementedError("Not Yet implemented.")
def astype(self, dtype, copy=True, errors='raise', **kwargs):
raise NotImplementedError("Not Yet implemented.")
def at(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
def at_time(self, time, asof=False):
raise NotImplementedError("Not Yet implemented.")
def autocorr(self, lag=1):
raise NotImplementedError("Not Yet implemented.")
def between(self, left, right, inclusive=True):
raise NotImplementedError("Not Yet implemented.")
def between_time(self, start_time, end_time, include_start=True,
include_end=True):
raise NotImplementedError("Not Yet implemented.")
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
raise NotImplementedError("Not Yet implemented.")
def bool(self):
raise NotImplementedError("Not Yet implemented.")
def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def clip_lower(self, threshold, axis=None):
raise NotImplementedError("Not Yet implemented.")
def clip_upper(self, threshold, axis=None):
raise NotImplementedError("Not Yet implemented.")
def combine(self, other, func, fill_value=np.nan):
raise NotImplementedError("Not Yet implemented.")
def combine_first(self, other):
raise NotImplementedError("Not Yet implemented.")
def compound(self, axis=None, skipna=None, level=None):
raise NotImplementedError("Not Yet implemented.")
def compress(self, condition, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def consolidate(self, inplace=False):
raise NotImplementedError("Not Yet implemented.")
def convert_objects(self, convert_dates=True, convert_numeric=False,
convert_timedeltas=True, copy=True):
raise NotImplementedError("Not Yet implemented.")
def copy(self, deep=True):
raise NotImplementedError("Not Yet implemented.")
def corr(self, other, method='pearson', min_periods=None):
raise NotImplementedError("Not Yet implemented.")
def count(self, level=None):
raise NotImplementedError("Not Yet implemented.")
def cov(self, other, min_periods=None):
raise NotImplementedError("Not Yet implemented.")
def cummax(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def cummin(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def cumprod(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def cumsum(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def describe(self, percentiles=None, include=None, exclude=None):
raise NotImplementedError("Not Yet implemented.")
def diff(self, periods=1):
raise NotImplementedError("Not Yet implemented.")
def div(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def divide(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def dot(self, other):
raise NotImplementedError("Not Yet implemented.")
def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
raise NotImplementedError("Not Yet implemented.")
def drop_duplicates(self, keep='first', inplace=False):
raise NotImplementedError("Not Yet implemented.")
def dropna(self, axis=0, inplace=False, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def duplicated(self, keep='first'):
raise NotImplementedError("Not Yet implemented.")
def eq(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def equals(self, other):
raise NotImplementedError("Not Yet implemented.")
def ewm(self, com=None, span=None, halflife=None, alpha=None,
min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0):
raise NotImplementedError("Not Yet implemented.")
def expanding(self, min_periods=1, freq=None, center=False, axis=0):
raise NotImplementedError("Not Yet implemented.")
def factorize(self, sort=False, na_sentinel=-1):
raise NotImplementedError("Not Yet implemented.")
def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
raise NotImplementedError("Not Yet implemented.")
def fillna(self, value=None, method=None, axis=None, inplace=False,
limit=None, downcast=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def filter(self, items=None, like=None, regex=None, axis=None):
raise NotImplementedError("Not Yet implemented.")
def first(self, offset):
raise NotImplementedError("Not Yet implemented.")
def first_valid_index(self):
raise NotImplementedError("Not Yet implemented.")
def floordiv(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def from_array(self, arr, index=None, name=None, dtype=None, copy=False,
fastpath=False):
raise NotImplementedError("Not Yet implemented.")
def from_csv(self, path, sep=',', parse_dates=True, header=None,
index_col=0, encoding=None, infer_datetime_format=False):
raise NotImplementedError("Not Yet implemented.")
def ge(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def get(self, key, default=None):
raise NotImplementedError("Not Yet implemented.")
def get_dtype_counts(self):
raise NotImplementedError("Not Yet implemented.")
def get_ftype_counts(self):
raise NotImplementedError("Not Yet implemented.")
def get_value(self, label, takeable=False):
raise NotImplementedError("Not Yet implemented.")
def get_values(self):
raise NotImplementedError("Not Yet implemented.")
def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True, squeeze=False, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def gt(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def head(self, n=5):
raise NotImplementedError("Not Yet implemented.")
def hist(self, by=None, ax=None, grid=True, xlabelsize=None, xrot=None,
ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds):
raise NotImplementedError("Not Yet implemented.")
def iat(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
def idxmax(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def idxmin(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def iloc(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
limit_direction='forward', downcast=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def isin(self, values):
raise NotImplementedError("Not Yet implemented.")
def isnull(self):
raise NotImplementedError("Not Yet implemented.")
def item(self):
raise NotImplementedError("Not Yet implemented.")
def items(self):
raise NotImplementedError("Not Yet implemented.")
def iteritems(self):
raise NotImplementedError("Not Yet implemented.")
def ix(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
def keys(self):
raise NotImplementedError("Not Yet implemented.")
def kurt(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def last(self, offset):
raise NotImplementedError("Not Yet implemented.")
def last_valid_index(self):
raise NotImplementedError("Not Yet implemented.")
def le(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def loc(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
def lt(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def mad(self, axis=None, skipna=None, level=None):
raise NotImplementedError("Not Yet implemented.")
def map(self, arg, na_action=None):
raise NotImplementedError("Not Yet implemented.")
def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
try_cast=False, raise_on_error=True):
raise NotImplementedError("Not Yet implemented.")
def max(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def median(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def memory_usage(self, index=True, deep=False):
raise NotImplementedError("Not Yet implemented.")
def min(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def mod(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def mode(self):
raise NotImplementedError("Not Yet implemented.")
def mul(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def multiply(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def ne(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def nlargest(self, n=5, keep='first'):
raise NotImplementedError("Not Yet implemented.")
def nonzero(self):
raise NotImplementedError("Not Yet implemented.")
def notnull(self):
raise NotImplementedError("Not Yet implemented.")
def nsmallest(self, n=5, keep='first'):
raise NotImplementedError("Not Yet implemented.")
def nunique(self, dropna=True):
raise NotImplementedError("Not Yet implemented.")
def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def pipe(self, func, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def plot(self, kind='line', ax=None, figsize=None, use_index=True,
title=None, grid=None, legend=False, style=None, logx=False,
logy=False, loglog=False, xticks=None, yticks=None, xlim=None,
ylim=None, rot=None, fontsize=None, colormap=None, table=False,
yerr=None, xerr=None, label=None, secondary_y=False, **kwds):
raise NotImplementedError("Not Yet implemented.")
def pop(self, item):
raise NotImplementedError("Not Yet implemented.")
def pow(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def prod(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def product(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def ptp(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def put(self, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def quantile(self, q=0.5, interpolation='linear'):
raise NotImplementedError("Not Yet implemented.")
def radd(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def rank(self, axis=0, method='average', numeric_only=None,
na_option='keep', ascending=True, pct=False):
raise NotImplementedError("Not Yet implemented.")
def ravel(self, order='C'):
raise NotImplementedError("Not Yet implemented.")
def rdiv(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def reindex(self, index=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def reindex_axis(self, labels, axis=0, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def reindex_like(self, other, method=None, copy=True, limit=None,
tolerance=None):
raise NotImplementedError("Not Yet implemented.")
def rename(self, index=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def rename_axis(self, mapper, axis=0, copy=True, inplace=False):
raise NotImplementedError("Not Yet implemented.")
def reorder_levels(self, order):
raise NotImplementedError("Not Yet implemented.")
def repeat(self, repeats, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def replace(self, to_replace=None, value=None, inplace=False, limit=None,
regex=False, method='pad', axis=None):
raise NotImplementedError("Not Yet implemented.")
def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
label=None, convention='start', kind=None, loffset=None,
limit=None, base=0, on=None, level=None):
raise NotImplementedError("Not Yet implemented.")
def reset_index(self, level=None, drop=False, name=None, inplace=False):
raise NotImplementedError("Not Yet implemented.")
def reshape(self, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def rfloordiv(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def rmod(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def rmul(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def rolling(self, window, min_periods=None, freq=None, center=False,
win_type=None, on=None, axis=0, closed=None):
raise NotImplementedError("Not Yet implemented.")
def round(self, decimals=0, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def rpow(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def rsub(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def rtruediv(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def sample(self, n=None, frac=None, replace=False, weights=None,
random_state=None, axis=None):
raise NotImplementedError("Not Yet implemented.")
def searchsorted(self, value, side='left', sorter=None):
raise NotImplementedError("Not Yet implemented.")
def select(self, crit, axis=0):
raise NotImplementedError("Not Yet implemented.")
def sem(self, axis=None, skipna=None, level=None, ddof=1,
numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def set_axis(self, axis, labels):
raise NotImplementedError("Not Yet implemented.")
def set_value(self, label, value, takeable=False):
raise NotImplementedError("Not Yet implemented.")
def shift(self, periods=1, freq=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def skew(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def slice_shift(self, periods=1, axis=0):
raise NotImplementedError("Not Yet implemented.")
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
kind='quicksort', na_position='last', sort_remaining=True):
raise NotImplementedError("Not Yet implemented.")
def sort_values(self, axis=0, ascending=True, inplace=False,
kind='quicksort', na_position='last'):
raise NotImplementedError("Not Yet implemented.")
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
raise NotImplementedError("Not Yet implemented.")
def squeeze(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
def std(self, axis=None, skipna=None, level=None, ddof=1,
numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def sub(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def subtract(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
def sum(self, axis=None, skipna=None, level=None, numeric_only=None,
**kwargs):
raise NotImplementedError("Not Yet implemented.")
def swapaxes(self, axis1, axis2, copy=True):
raise NotImplementedError("Not Yet implemented.")
def swaplevel(self, i=-2, j=-1, copy=True):
raise NotImplementedError("Not Yet implemented.")
def tail(self, n=5):
raise NotImplementedError("Not Yet implemented.")
def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def to_clipboard(self, excel=None, sep=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def to_csv(self, path=None, index=True, sep=',', na_rep='',
float_format=None, header=False, index_label=None, mode='w',
encoding=None, date_format=None, decimal='.'):
raise NotImplementedError("Not Yet implemented.")
def to_dense(self):
raise NotImplementedError("Not Yet implemented.")
def to_dict(self):
raise NotImplementedError("Not Yet implemented.")
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
float_format=None, columns=None, header=True, index=True,
index_label=None, startrow=0, startcol=0, engine=None,
merge_cells=True, encoding=None, inf_rep='inf',
verbose=True):
raise NotImplementedError("Not Yet implemented.")
def to_frame(self, name=None):
raise NotImplementedError("Not Yet implemented.")
def to_hdf(self, path_or_buf, key, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def to_json(self, path_or_buf=None, orient=None, date_format=None,
double_precision=10, force_ascii=True, date_unit='ms',
default_handler=None, lines=False):
raise NotImplementedError("Not Yet implemented.")
def to_latex(self, buf=None, columns=None, col_space=None, header=True,
index=True, na_rep='NaN', formatters=None, float_format=None,
sparsify=None, index_names=True, bold_rows=False,
column_format=None, longtable=None, escape=None,
encoding=None, decimal='.', multicolumn=None,
multicolumn_format=None, multirow=None):
raise NotImplementedError("Not Yet implemented.")
def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
raise NotImplementedError("Not Yet implemented.")
def to_period(self, freq=None, copy=True):
raise NotImplementedError("Not Yet implemented.")
def to_pickle(self, path, compression='infer'):
raise NotImplementedError("Not Yet implemented.")
def to_sparse(self, kind='block', fill_value=None):
raise NotImplementedError("Not Yet implemented.")
def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
index=True, index_label=None, chunksize=None, dtype=None):
raise NotImplementedError("Not Yet implemented.")
def to_string(self, buf=None, na_rep='NaN', float_format=None,
header=True, index=True, length=False, dtype=False,
name=False, max_rows=None):
raise NotImplementedError("Not Yet implemented.")
def to_timestamp(self, freq=None, how='start', copy=True):
raise NotImplementedError("Not Yet implemented.")
def to_xarray(self):
raise NotImplementedError("Not Yet implemented.")
def tolist(self):
raise NotImplementedError("Not Yet implemented.")
def transform(self, func, *args, **kwargs):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def transpose(self, *args, **kwargs):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def truediv(self, other, level=None, fill_value=None, axis=0):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def truncate(self, before=None, after=None, axis=None, copy=True):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def tshift(self, periods=1, freq=None, axis=0):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def tz_convert(self, tz, axis=0, level=None, copy=True):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def tz_localize(self, tz, axis=0, level=None, copy=True,
                ambiguous='raise'):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def unique(self):
    """Stub: not implemented yet; always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
def unstack(self, level=-1, fill_value=None):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def update(self, other):
    """Stub: always raises NotImplementedError (``other`` is ignored)."""
    raise NotImplementedError("Not Yet implemented.")


# ``upandasate`` is the name this stub was first published under; it looks
# like ``update`` mangled by a blanket "pd" -> "pandas" rename. Keep it as an
# alias so any existing caller of the old name still works.
upandasate = update
def valid(self, inplace=False, **kwargs):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def value_counts(self, normalize=False, sort=True, ascending=False,
                 bins=None, dropna=True):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def var(self, axis=None, skipna=None, level=None, ddof=1,
        numeric_only=None, **kwargs):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def view(self, dtype=None):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
          try_cast=False, raise_on_error=True):
    """Stub: always raises NotImplementedError (arguments are ignored)."""
    raise NotImplementedError("Not Yet implemented.")
def xs(self, key, axis=0, level=None, drop_level=True):
    """Stub: always raises NotImplementedError (arguments are ignored).

    ``self`` is declared explicitly so that a bound call such as
    ``obj.xs(key)`` binds ``key`` to the caller's argument instead of to
    the instance.
    """
    raise NotImplementedError("Not Yet implemented.")
@property
def asobject(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def axes(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def base(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def blocks(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def data(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def dtype(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def dtypes(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def empty(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def flags(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def ftype(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def ftypes(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def hasnans(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def imag(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def index(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def is_copy(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def is_monotonic(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def is_monotonic_decreasing(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def is_monotonic_increasing(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def is_unique(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def itemsize(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def name(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def nbytes(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def ndim(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def real(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def shape(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def size(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def strides(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
@property
def values(self):
    """Stub property: reading it always raises NotImplementedError."""
    raise NotImplementedError("Not Yet implemented.")
-127
View File
@@ -1,127 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import pandas
import ray.dataframe as pd
from ray.dataframe.utils import (
to_pandas,
from_pandas
)
def ray_df_equals_pandas(ray_df, pandas_df):
    """Return whether ``ray_df``, converted to pandas, equals ``pandas_df``.

    Both frames are sorted by index before the comparison, so row order
    does not affect the result.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index())
def generate_dfs():
    """Return two 4-row pandas frames sharing the columns col1..col3.

    The first frame additionally has col4/col5, the second col6/col7.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                           'col2': [4, 5, 6, 7],
                           'col3': [8, 9, 10, 11],
                           'col4': [12, 13, 14, 15],
                           'col5': [0, 0, 0, 0]})
    df2 = pandas.DataFrame({'col1': [0, 1, 2, 3],
                            'col2': [4, 5, 6, 7],
                            'col3': [8, 9, 10, 11],
                            'col6': [12, 13, 14, 15],
                            'col7': [0, 0, 0, 0]})
    return df, df2
def generate_none_dfs():
    """Return two 4-row pandas frames, the first containing ``None`` cells.

    The first frame has one ``None`` in col2 and an all-``None`` col5; the
    second frame has no missing values and uses col6/col7 instead of
    col4/col5.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                           'col2': [4, 5, None, 7],
                           'col3': [8, 9, 10, 11],
                           'col4': [12, 13, 14, 15],
                           'col5': [None, None, None, None]})
    df2 = pandas.DataFrame({'col1': [0, 1, 2, 3],
                            'col2': [4, 5, 6, 7],
                            'col3': [8, 9, 10, 11],
                            'col6': [12, 13, 14, 15],
                            'col7': [0, 0, 0, 0]})
    return df, df2
# NOTE(review): this is decorated as a fixture, so pytest presumably never
# collects it as a test even though it is named and written like one.
# Confirm whether it was disabled on purpose before removing the decorator.
@pytest.fixture
def test_df_concat():
    """Compare ``pd.concat`` with ``pandas.concat`` on two pandas frames."""
    df, df2 = generate_dfs()
    assert(ray_df_equals_pandas(pd.concat([df, df2]),
                                pandas.concat([df, df2])))
def test_ray_concat():
    """Concatenating two Ray frames must match ``pandas.concat``."""
    df, df2 = generate_dfs()
    ray_df = from_pandas(df, 2)
    ray_df2 = from_pandas(df2, 2)
    actual = pd.concat([ray_df, ray_df2])
    expected = pandas.concat([df, df2])
    assert ray_df_equals_pandas(actual, expected)
def test_ray_concat_on_index():
    """Row-wise concat must match pandas for every spelling of axis 0."""
    df, df2 = generate_dfs()
    ray_df = from_pandas(df, 2)
    ray_df2 = from_pandas(df2, 2)
    # The three values are the spellings of the row axis exercised here.
    for axis in ('index', 'rows', 0):
        actual = pd.concat([ray_df, ray_df2], axis=axis)
        expected = pandas.concat([df, df2], axis=axis)
        assert ray_df_equals_pandas(actual, expected)
def test_ray_concat_on_column():
    """Column-wise concat must match pandas for both spellings of axis 1."""
    df, df2 = generate_dfs()
    ray_df = from_pandas(df, 2)
    ray_df2 = from_pandas(df2, 2)
    for axis in (1, "columns"):
        actual = pd.concat([ray_df, ray_df2], axis=axis)
        expected = pandas.concat([df, df2], axis=axis)
        assert ray_df_equals_pandas(actual, expected)
def test_invalid_axis_errors():
    """``pd.concat`` must raise ValueError for the out-of-range axis 2."""
    df, df2 = generate_dfs()
    ray_frames = [from_pandas(df, 2), from_pandas(df2, 2)]
    with pytest.raises(ValueError):
        pd.concat(ray_frames, axis=2)
def test_mixed_concat():
    """Concat of two Ray frames and one pandas frame must match pandas."""
    df, df2 = generate_dfs()
    df3 = df.copy()
    mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
    actual = pd.concat(mixed_dfs)
    expected = pandas.concat([df, df2, df3])
    assert ray_df_equals_pandas(actual, expected)
def test_mixed_inner_concat():
    """Inner-join concat of mixed Ray/pandas frames must match pandas."""
    df, df2 = generate_dfs()
    df3 = df.copy()
    mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
    actual = pd.concat(mixed_dfs, join='inner')
    expected = pandas.concat([df, df2, df3], join='inner')
    assert ray_df_equals_pandas(actual, expected)
def test_mixed_none_concat():
    """Mixed Ray/pandas concat must match pandas when cells are ``None``."""
    df, df2 = generate_none_dfs()
    df3 = df.copy()
    mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
    actual = pd.concat(mixed_dfs)
    expected = pandas.concat([df, df2, df3])
    assert ray_df_equals_pandas(actual, expected)
File diff suppressed because it is too large Load Diff
-589
View File
@@ -1,589 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import sys
import pandas
import numpy as np
import ray.dataframe as pd
from ray.dataframe.utils import (
from_pandas,
to_pandas)
# True when the interpreter running the tests is Python 2.
PY2 = sys.version_info.major < 3
def ray_df_equals_pandas(ray_df, pandas_df):
    """Assert ``ray_df`` is a Ray DataFrame equal to ``pandas_df``.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    assert isinstance(ray_df, pd.DataFrame)
    assert to_pandas(ray_df).equals(pandas_df)
def ray_df_almost_equals_pandas(ray_df, pandas_df):
    """Assert ``ray_df`` equals ``pandas_df`` exactly or within 1e-4.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    assert isinstance(ray_df, pd.DataFrame)
    ray_as_pandas = to_pandas(ray_df)
    # Compare the absolute difference: a signed maximum would let an
    # arbitrarily large negative deviation slip under the tolerance.
    diff_max = (ray_as_pandas - pandas_df).abs().max().max()
    assert ray_as_pandas.equals(pandas_df) or diff_max < 0.0001
def ray_series_equals_pandas(ray_df, pandas_df):
    """Assert ``ray_df.equals(pandas_df)`` holds.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    assert ray_df.equals(pandas_df)
def ray_df_equals(ray_df1, ray_df2):
    """Assert two Ray frames are equal after conversion to pandas.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    assert to_pandas(ray_df1).equals(to_pandas(ray_df2))
def ray_groupby_equals_pandas(ray_groupby, pandas_groupby):
    """Assert both groupby objects yield the same (key, frame) pairs.

    The two iterables are walked in lockstep with ``zip``, so surplus
    groups on either side are not detected here.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    for (ray_key, ray_group), (pandas_key, pandas_group) in zip(
            ray_groupby, pandas_groupby):
        assert ray_key == pandas_key
        ray_df_equals_pandas(ray_group, pandas_group)
def test_simple_row_groupby():
    """Run every groupby check on a 4-row frame grouped into two row groups."""
    pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                                  'col2': [4, 5, 6, 7],
                                  'col3': [3, 8, 12, 10],
                                  'col4': [17, 13, 16, 15],
                                  'col5': [-4, -5, -6, -7]})
    ray_df = from_pandas(pandas_df, 2)
    # One label per row: rows 0 and 2 form group 1, rows 1 and 3 group 2.
    by = [1, 2, 1, 2]
    # Row count passed to the head/tail checks.
    n = 1
    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)
    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_idxmax(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    test_cumsum(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    test_cummax(ray_groupby, pandas_groupby)
    apply_functions = [lambda df: df.sum(), lambda df: -df]
    for func in apply_functions:
        test_apply(ray_groupby, pandas_groupby, func)
    test_dtypes(ray_groupby, pandas_groupby)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_cummin(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_idxmin(ray_groupby, pandas_groupby)
    test_prod(ray_groupby, pandas_groupby)
    test_std(ray_groupby, pandas_groupby)
    agg_functions = ['min', 'max']
    for func in agg_functions:
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)
    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_rank(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    test_ngroup(ray_groupby, pandas_groupby)
    test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        test_transform(ray_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        test_pipe(ray_groupby, pandas_groupby, func)
    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
def test_single_group_row_groupby():
    """Run every groupby check when all rows fall into a single group."""
    pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                                  'col2': [4, 5, 36, 7],
                                  'col3': [3, 8, 12, 10],
                                  'col4': [17, 3, 16, 15],
                                  'col5': [-4, 5, -6, -7]})
    ray_df = from_pandas(pandas_df, 2)
    # Every row carries the same label, so there is exactly one group.
    by = [1, 1, 1, 1]
    # Row count passed to the head/tail checks; larger than the frame.
    n = 6
    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)
    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_idxmax(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    test_cumsum(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    test_cummax(ray_groupby, pandas_groupby)
    apply_functions = [lambda df: df.sum(), lambda df: -df]
    for func in apply_functions:
        test_apply(ray_groupby, pandas_groupby, func)
    test_dtypes(ray_groupby, pandas_groupby)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_cummin(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_idxmin(ray_groupby, pandas_groupby)
    test_prod(ray_groupby, pandas_groupby)
    test_std(ray_groupby, pandas_groupby)
    agg_functions = ['min', 'max']
    for func in agg_functions:
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)
    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_rank(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    test_ngroup(ray_groupby, pandas_groupby)
    test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        test_transform(ray_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        test_pipe(ray_groupby, pandas_groupby, func)
    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
def test_large_row_groupby():
    """Run the groupby checks on a random 100x4 frame grouped by column A."""
    # NOTE(review): the data is drawn without a fixed seed, so every run
    # exercises different values; a failure may not be reproducible.
    pandas_df = pandas.DataFrame(np.random.randint(0, 8, size=(100, 4)),
                                 columns=list('ABCD'))
    ray_df = from_pandas(pandas_df, 2)
    # Group rows by the values in column A, passed as a plain list.
    by = pandas_df['A'].tolist()
    # Row count passed to the head/tail checks.
    n = 4
    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)
    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_idxmax(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    test_cumsum(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    test_cummax(ray_groupby, pandas_groupby)
    apply_functions = [lambda df: df.sum(), lambda df: -df]
    for func in apply_functions:
        test_apply(ray_groupby, pandas_groupby, func)
    test_dtypes(ray_groupby, pandas_groupby)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_cummin(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_idxmin(ray_groupby, pandas_groupby)
    # test_prod(ray_groupby, pandas_groupby) causes overflows
    test_std(ray_groupby, pandas_groupby)
    agg_functions = ['min', 'max']
    for func in agg_functions:
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)
    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_rank(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    test_ngroup(ray_groupby, pandas_groupby)
    test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_head(ray_groupby, pandas_groupby, n)
    # test_cumprod(ray_groupby, pandas_groupby) causes overflows
    test_cov(ray_groupby, pandas_groupby)
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        test_transform(ray_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        test_pipe(ray_groupby, pandas_groupby, func)
    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
def test_simple_col_groupby():
    """Run the groupby checks on a frame grouped along its columns."""
    pandas_df = pandas.DataFrame({'col1': [0, 3, 2, 3],
                                  'col2': [4, 1, 6, 7],
                                  'col3': [3, 8, 2, 10],
                                  'col4': [1, 13, 6, 15],
                                  'col5': [-4, 5, 6, -7]})
    ray_df = from_pandas(pandas_df, 2)
    # One label per column (the grouping runs along axis=1).
    by = [1, 2, 3, 2, 1]
    ray_groupby = ray_df.groupby(axis=1, by=by)
    pandas_groupby = pandas_df.groupby(axis=1, by=by)
    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    # NOTE(review): it is assumed that the rank and quantile checks belong
    # to this Python-3-only guard together with idxmax/idxmin -- confirm
    # against the original layout. On Python 3 both readings run the same
    # checks in the same order.
    if not PY2:
        # idxmax and idxmin fail on column groupby in pandas with python2
        test_idxmax(ray_groupby, pandas_groupby)
        test_idxmin(ray_groupby, pandas_groupby)
        test_rank(ray_groupby, pandas_groupby)
        test_quantile(ray_groupby, pandas_groupby)
    # https://github.com/pandas-dev/pandas/issues/21127
    # test_cumsum(ray_groupby, pandas_groupby)
    # test_cummax(ray_groupby, pandas_groupby)
    # test_cummin(ray_groupby, pandas_groupby)
    # test_cumprod(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    apply_functions = [lambda df: -df, lambda df: df.sum(axis=1)]
    for func in apply_functions:
        test_apply(ray_groupby, pandas_groupby, func)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_prod(ray_groupby, pandas_groupby)
    test_std(ray_groupby, pandas_groupby)
    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    # Pandas fails on this case with ValueError
    # test_ngroup(ray_groupby, pandas_groupby)
    # test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)
    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        test_transform(ray_groupby, pandas_groupby, func)
    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        test_pipe(ray_groupby, pandas_groupby, func)
    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
@pytest.fixture
def test_ngroups(ray_groupby, pandas_groupby):
    """Check that both groupby objects report the same ``ngroups``."""
    ray_count = ray_groupby.ngroups
    pandas_count = pandas_groupby.ngroups
    assert ray_count == pandas_count
@pytest.fixture
def test_skew(ray_groupby, pandas_groupby):
    """Check ``skew()`` of both groupby objects, allowing a tolerance."""
    ray_result = ray_groupby.skew()
    pandas_result = pandas_groupby.skew()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_ffill(ray_groupby, pandas_groupby):
    """Check that ``ffill()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.ffill()
    pandas_result = pandas_groupby.ffill()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_sem(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``sem()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.sem()
@pytest.fixture
def test_mean(ray_groupby, pandas_groupby):
    """Check ``mean()`` of both groupby objects, allowing a tolerance."""
    ray_result = ray_groupby.mean()
    pandas_result = pandas_groupby.mean()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_any(ray_groupby, pandas_groupby):
    """Check that ``any()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.any()
    pandas_result = pandas_groupby.any()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_min(ray_groupby, pandas_groupby):
    """Check that ``min()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.min()
    pandas_result = pandas_groupby.min()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_idxmax(ray_groupby, pandas_groupby):
    """Check that ``idxmax()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.idxmax()
    pandas_result = pandas_groupby.idxmax()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_ndim(ray_groupby, pandas_groupby):
    """Check that both groupby objects report the same ``ndim``."""
    ray_ndim = ray_groupby.ndim
    pandas_ndim = pandas_groupby.ndim
    assert ray_ndim == pandas_ndim
@pytest.fixture
def test_cumsum(ray_groupby, pandas_groupby):
    """Check ``cumsum()`` with default arguments and with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cumsum(**kwargs)
        pandas_result = pandas_groupby.cumsum(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_pct_change(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``pct_change()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.pct_change()
@pytest.fixture
def test_cummax(ray_groupby, pandas_groupby):
    """Check ``cummax()`` with default arguments and with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cummax(**kwargs)
        pandas_result = pandas_groupby.cummax(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_apply(ray_groupby, pandas_groupby, func):
    """Check that ``apply(func)`` gives equal results on both objects."""
    ray_result = ray_groupby.apply(func)
    pandas_result = pandas_groupby.apply(func)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_dtypes(ray_groupby, pandas_groupby):
    """Check that the ``dtypes`` attribute is equal on both objects."""
    ray_result = ray_groupby.dtypes
    pandas_result = pandas_groupby.dtypes
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_first(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``first()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.first()
@pytest.fixture
def test_backfill(ray_groupby, pandas_groupby):
    """Check that ``backfill()`` gives equal results on both objects."""
    ray_result = ray_groupby.backfill()
    pandas_result = pandas_groupby.backfill()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_cummin(ray_groupby, pandas_groupby):
    """Check ``cummin()`` with default arguments and with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cummin(**kwargs)
        pandas_result = pandas_groupby.cummin(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_bfill(ray_groupby, pandas_groupby):
    """Check that ``bfill()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.bfill()
    pandas_result = pandas_groupby.bfill()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_idxmin(ray_groupby, pandas_groupby):
    """Check that ``idxmin()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.idxmin()
    pandas_result = pandas_groupby.idxmin()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_prod(ray_groupby, pandas_groupby):
    """Check that ``prod()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.prod()
    pandas_result = pandas_groupby.prod()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_std(ray_groupby, pandas_groupby):
    """Check ``std()`` of both groupby objects, allowing a tolerance."""
    ray_result = ray_groupby.std()
    pandas_result = pandas_groupby.std()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_aggregate(ray_groupby, pandas_groupby, func):
    """Check that ``aggregate(func)`` gives equal results on both objects."""
    ray_result = ray_groupby.aggregate(func)
    pandas_result = pandas_groupby.aggregate(func)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_agg(ray_groupby, pandas_groupby, func):
    """Check that ``agg(func)`` gives equal results on both objects."""
    ray_result = ray_groupby.agg(func)
    pandas_result = pandas_groupby.agg(func)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_last(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``last()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.last()
@pytest.fixture
def test_mad(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``mad()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.mad()
@pytest.fixture
def test_rank(ray_groupby, pandas_groupby):
    """Check that ``rank()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.rank()
    pandas_result = pandas_groupby.rank()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_max(ray_groupby, pandas_groupby):
    """Check that ``max()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.max()
    pandas_result = pandas_groupby.max()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_var(ray_groupby, pandas_groupby):
    """Check ``var()`` of both groupby objects, allowing a tolerance."""
    ray_result = ray_groupby.var()
    pandas_result = pandas_groupby.var()
    ray_df_almost_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_len(ray_groupby, pandas_groupby):
    """Check that ``len()`` is the same for both groupby objects."""
    ray_length = len(ray_groupby)
    pandas_length = len(pandas_groupby)
    assert ray_length == pandas_length
@pytest.fixture
def test_sum(ray_groupby, pandas_groupby):
    """Check that ``sum()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.sum()
    pandas_result = pandas_groupby.sum()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_ngroup(ray_groupby, pandas_groupby):
    """Check that ``ngroup()`` gives equal results on both objects."""
    ray_result = ray_groupby.ngroup()
    pandas_result = pandas_groupby.ngroup()
    ray_series_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_nunique(ray_groupby, pandas_groupby):
    """Check that ``nunique()`` gives equal results on both objects."""
    ray_result = ray_groupby.nunique()
    pandas_result = pandas_groupby.nunique()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_median(ray_groupby, pandas_groupby):
    """Check that ``median()`` gives equal results on both objects."""
    ray_result = ray_groupby.median()
    pandas_result = pandas_groupby.median()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_head(ray_groupby, pandas_groupby, n):
    """Check that ``head(n=n)`` gives equal results on both objects."""
    ray_result = ray_groupby.head(n=n)
    pandas_result = pandas_groupby.head(n=n)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_cumprod(ray_groupby, pandas_groupby):
    """Check ``cumprod()`` with default arguments and with ``axis=1``."""
    for kwargs in ({}, {'axis': 1}):
        ray_result = ray_groupby.cumprod(**kwargs)
        pandas_result = pandas_groupby.cumprod(**kwargs)
        ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_cov(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``cov()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.cov()
@pytest.fixture
def test_transform(ray_groupby, pandas_groupby, func):
    """Check that ``transform(func)`` gives equal results on both objects."""
    ray_result = ray_groupby.transform(func)
    pandas_result = pandas_groupby.transform(func)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_corr(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``corr()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.corr()
@pytest.fixture
def test_fillna(ray_groupby, pandas_groupby):
    """Check that ``fillna(method="ffill")`` is equal on both objects."""
    ray_result = ray_groupby.fillna(method="ffill")
    pandas_result = pandas_groupby.fillna(method="ffill")
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_count(ray_groupby, pandas_groupby):
    """Check that ``count()`` gives equal results on both groupby objects."""
    ray_result = ray_groupby.count()
    pandas_result = pandas_groupby.count()
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_pipe(ray_groupby, pandas_groupby, func):
    """Check that ``pipe(func)`` gives equal results on both objects."""
    ray_result = ray_groupby.pipe(func)
    pandas_result = pandas_groupby.pipe(func)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_tail(ray_groupby, pandas_groupby, n):
    """Check that ``tail(n=n)`` gives equal results on both objects."""
    ray_result = ray_groupby.tail(n=n)
    pandas_result = pandas_groupby.tail(n=n)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_quantile(ray_groupby, pandas_groupby):
    """Check that ``quantile(q=0.4)`` is equal on both groupby objects."""
    ray_result = ray_groupby.quantile(q=0.4)
    pandas_result = pandas_groupby.quantile(q=0.4)
    ray_df_equals_pandas(ray_result, pandas_result)
@pytest.fixture
def test_take(ray_groupby, pandas_groupby):
    """Check that the Ray groupby ``take()`` raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        ray_groupby.take(indices=[1])
-675
View File
@@ -1,675 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import numpy as np
import pandas
from ray.dataframe.utils import to_pandas
import ray.dataframe as pd
import os
import sqlite3
# Scratch files written and removed by the tests (relative to the CWD).
TEST_PARQUET_FILENAME = 'test.parquet'
TEST_CSV_FILENAME = 'test.csv'
TEST_JSON_FILENAME = 'test.json'
TEST_HTML_FILENAME = 'test.html'
TEST_EXCEL_FILENAME = 'test.xlsx'
TEST_FEATHER_FILENAME = 'test.feather'
TEST_HDF_FILENAME = 'test.hdf'
TEST_MSGPACK_FILENAME = 'test.msg'
TEST_STATA_FILENAME = 'test.dta'
TEST_PICKLE_FILENAME = 'test.pkl'
# Input file for the SAS test, resolved against the CWD at import time.
TEST_SAS_FILENAME = os.getcwd() + '/data/test1.sas7bdat'
TEST_SQL_FILENAME = 'test.db'
# Row counts handed to the setup helpers. Both are ints: a float count
# would make ``np.arange`` produce float columns instead of integer ones.
SMALL_ROW_SIZE = 2000
LARGE_ROW_SIZE = 7000000
def ray_df_equals_pandas(ray_df, pandas_df):
    """Return whether ``ray_df``, converted to pandas, equals ``pandas_df``.

    Both frames are sorted by index before the comparison, so row order
    does not affect the result.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index())
def setup_parquet_file(row_size, force=False):
    """Write the parquet scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_PARQUET_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_parquet(TEST_PARQUET_FILENAME)
def create_test_ray_dataframe():
    """Return a 4-row, 5-column Ray DataFrame with fixed integer values.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    df = pd.DataFrame({'col1': [0, 1, 2, 3],
                       'col2': [4, 5, 6, 7],
                       'col3': [8, 9, 10, 11],
                       'col4': [12, 13, 14, 15],
                       'col5': [0, 0, 0, 0]})
    return df
def create_test_pandas_dataframe():
    """Return a 4-row, 5-column pandas DataFrame with fixed integer values.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                           'col2': [4, 5, 6, 7],
                           'col3': [8, 9, 10, 11],
                           'col4': [12, 13, 14, 15],
                           'col5': [0, 0, 0, 0]})
    return df
# NOTE(review): the fixture decorator appears to exist only so that pytest
# does not collect this ``test_``-prefixed helper as a test; callers invoke
# it directly -- confirm before changing it.
@pytest.fixture
def test_files_eq(path1, path2):
    """Return True when the two files have byte-identical contents."""
    with open(path1, 'rb') as file1, open(path2, 'rb') as file2:
        file1_content = file1.read()
        file2_content = file2.read()
    return file1_content == file2_content
def teardown_test_file(test_path):
    """Delete ``test_path`` if it exists; do nothing otherwise.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(test_path):
        os.remove(test_path)
def teardown_parquet_file():
    """Delete the parquet scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_PARQUET_FILENAME):
        os.remove(TEST_PARQUET_FILENAME)
def setup_csv_file(row_size, force=False, delimiter=','):
    """Write the CSV scratch file with two ``arange(row_size)`` columns.

    ``delimiter`` is used as the field separator. Nothing is written when
    the file already exists, unless ``force`` is true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_CSV_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_csv(TEST_CSV_FILENAME, sep=delimiter)
def teardown_csv_file():
    """Delete the CSV scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_CSV_FILENAME):
        os.remove(TEST_CSV_FILENAME)
def setup_json_file(row_size, force=False):
    """Write the JSON scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_JSON_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_json(TEST_JSON_FILENAME)
def teardown_json_file():
    """Delete the JSON scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_JSON_FILENAME):
        os.remove(TEST_JSON_FILENAME)
def setup_html_file(row_size, force=False):
    """Write the HTML scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_HTML_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_html(TEST_HTML_FILENAME)
def teardown_html_file():
    """Delete the HTML scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_HTML_FILENAME):
        os.remove(TEST_HTML_FILENAME)
def setup_clipboard(row_size, force=False):
    """Copy a frame with two ``arange(row_size)`` columns to the clipboard.

    ``force`` is accepted for symmetry with the file-based setup helpers
    but is not used.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_clipboard()
def setup_excel_file(row_size, force=False):
    """Write the Excel scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_EXCEL_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_excel(TEST_EXCEL_FILENAME)
def teardown_excel_file():
    """Delete the Excel scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_EXCEL_FILENAME):
        os.remove(TEST_EXCEL_FILENAME)
def setup_feather_file(row_size, force=False):
    """Write the feather scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_FEATHER_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_feather(TEST_FEATHER_FILENAME)
def teardown_feather_file():
    """Delete the feather scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_FEATHER_FILENAME):
        os.remove(TEST_FEATHER_FILENAME)
def setup_hdf_file(row_size, force=False):
    """Write the HDF scratch file with two ``arange(row_size)`` columns.

    The frame is stored under the key ``'test'``. Nothing is written when
    the file already exists, unless ``force`` is true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_HDF_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_hdf(TEST_HDF_FILENAME, 'test')
def teardown_hdf_file():
    """Delete the HDF scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_HDF_FILENAME):
        os.remove(TEST_HDF_FILENAME)
def setup_msgpack_file(row_size, force=False):
    """Write the msgpack scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_MSGPACK_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_msgpack(TEST_MSGPACK_FILENAME)
def teardown_msgpack_file():
    """Delete the msgpack scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_MSGPACK_FILENAME):
        os.remove(TEST_MSGPACK_FILENAME)
def setup_stata_file(row_size, force=False):
    """Write the Stata scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_STATA_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_stata(TEST_STATA_FILENAME)
def teardown_stata_file():
    """Delete the Stata scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_STATA_FILENAME):
        os.remove(TEST_STATA_FILENAME)
def setup_pickle_file(row_size, force=False):
    """Write the pickle scratch file with two ``arange(row_size)`` columns.

    Nothing is written when the file already exists, unless ``force`` is
    true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_PICKLE_FILENAME) and not force:
        return
    df = pandas.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_pickle(TEST_PICKLE_FILENAME)
def teardown_pickle_file():
    """Delete the pickle scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_PICKLE_FILENAME):
        os.remove(TEST_PICKLE_FILENAME)
def setup_sql_file(conn, force=False):
    """Write a fixed 4x5 frame into a table through connection ``conn``.

    The table name is the SQL scratch filename without its extension.
    Nothing is written when the scratch file already exists, unless
    ``force`` is true.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_SQL_FILENAME) and not force:
        return
    df = pandas.DataFrame({'col1': [0, 1, 2, 3],
                           'col2': [4, 5, 6, 7],
                           'col3': [8, 9, 10, 11],
                           'col4': [12, 13, 14, 15],
                           'col5': [0, 0, 0, 0]})
    df.to_sql(TEST_SQL_FILENAME.split(".")[0], conn)
def teardown_sql_file():
    """Delete the SQL scratch file if it exists.

    This is a plain helper that tests call directly; it is deliberately
    not a pytest fixture, because pytest >= 4 refuses direct fixture calls.
    """
    if os.path.exists(TEST_SQL_FILENAME):
        os.remove(TEST_SQL_FILENAME)
def test_from_parquet_small():
    """``pd.read_parquet`` must match pandas on the small parquet file."""
    try:
        setup_parquet_file(SMALL_ROW_SIZE)
        pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
        ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_parquet_file()
def test_from_parquet_large():
    """``pd.read_parquet`` must match pandas on the large parquet file."""
    try:
        setup_parquet_file(LARGE_ROW_SIZE)
        pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
        ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_parquet_file()
def test_from_csv():
    """``pd.read_csv`` must match ``pandas.read_csv`` on the CSV file."""
    try:
        setup_csv_file(SMALL_ROW_SIZE)
        pandas_df = pandas.read_csv(TEST_CSV_FILENAME)
        ray_df = pd.read_csv(TEST_CSV_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_csv_file()
def test_from_json():
    """``pd.read_json`` must match ``pandas.read_json`` on the JSON file."""
    try:
        setup_json_file(SMALL_ROW_SIZE)
        pandas_df = pandas.read_json(TEST_JSON_FILENAME)
        ray_df = pd.read_json(TEST_JSON_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_json_file()
def test_from_html():
    """``pd.read_html`` must match the first table pandas reads."""
    try:
        setup_html_file(SMALL_ROW_SIZE)
        # pandas returns a list of tables; only the first one is compared.
        pandas_df = pandas.read_html(TEST_HTML_FILENAME)[0]
        ray_df = pd.read_html(TEST_HTML_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_html_file()
@pytest.mark.skip(reason="No clipboard on Travis")
def test_from_clipboard():
    """``pd.read_clipboard`` must match ``pandas.read_clipboard``."""
    setup_clipboard(SMALL_ROW_SIZE)
    expected = pandas.read_clipboard()
    actual = pd.read_clipboard()
    assert ray_df_equals_pandas(actual, expected)
def test_from_excel():
    """``pd.read_excel`` must match ``pandas.read_excel``."""
    try:
        setup_excel_file(SMALL_ROW_SIZE)
        pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME)
        ray_df = pd.read_excel(TEST_EXCEL_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_excel_file()
def test_from_feather():
    """``pd.read_feather`` must match ``pandas.read_feather``."""
    try:
        setup_feather_file(SMALL_ROW_SIZE)
        pandas_df = pandas.read_feather(TEST_FEATHER_FILENAME)
        ray_df = pd.read_feather(TEST_FEATHER_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_feather_file()
@pytest.mark.skip(reason="Memory overflow on Travis")
def test_from_hdf():
    """``pd.read_hdf`` must match ``pandas.read_hdf`` for key 'test'."""
    try:
        setup_hdf_file(SMALL_ROW_SIZE)
        pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key='test')
        ray_df = pd.read_hdf(TEST_HDF_FILENAME, key='test')
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_hdf_file()
def test_from_msgpack():
    """``pd.read_msgpack`` must match ``pandas.read_msgpack``."""
    try:
        setup_msgpack_file(SMALL_ROW_SIZE)
        pandas_df = pandas.read_msgpack(TEST_MSGPACK_FILENAME)
        ray_df = pd.read_msgpack(TEST_MSGPACK_FILENAME)
        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Clean up even on failure, so a stale file is never reused.
        teardown_msgpack_file()
def test_from_stata():
    """Compare ``pd.read_stata`` with ``pandas.read_stata`` on the prepared file."""
    setup_stata_file(SMALL_ROW_SIZE)

    expected = pandas.read_stata(TEST_STATA_FILENAME)
    actual = pd.read_stata(TEST_STATA_FILENAME)
    frames_match = ray_df_equals_pandas(actual, expected)
    assert frames_match

    teardown_stata_file()
def test_from_pickle():
    """Compare ``pd.read_pickle`` with ``pandas.read_pickle`` on the prepared file."""
    setup_pickle_file(SMALL_ROW_SIZE)

    expected = pandas.read_pickle(TEST_PICKLE_FILENAME)
    actual = pd.read_pickle(TEST_PICKLE_FILENAME)
    frames_match = ray_df_equals_pandas(actual, expected)
    assert frames_match

    teardown_pickle_file()
def test_from_sql():
    """Compare ``pd.read_sql`` with ``pandas.read_sql`` on the prepared database.

    The sqlite connection is closed even when the comparison fails, so the
    database file is not left open when it is removed afterwards.
    """
    conn = sqlite3.connect(TEST_SQL_FILENAME)
    try:
        setup_sql_file(conn, True)

        pandas_df = pandas.read_sql("select * from test", conn)
        ray_df = pd.read_sql("select * from test", conn)

        assert ray_df_equals_pandas(ray_df, pandas_df)
    finally:
        # Closing an already-closed sqlite3 connection is harmless.
        conn.close()

    teardown_sql_file()
@pytest.mark.skip(reason="No SAS write methods in Pandas")
def test_from_sas():
    """Compare ``pd.read_sas`` with ``pandas.read_sas`` (skipped)."""
    expected = pandas.read_sas(TEST_SAS_FILENAME)
    actual = pd.read_sas(TEST_SAS_FILENAME)
    frames_match = ray_df_equals_pandas(actual, expected)
    assert frames_match
def test_from_csv_delimiter():
    """Compare both CSV readers on a file prepared with a ``'|'`` delimiter."""
    setup_csv_file(SMALL_ROW_SIZE, delimiter='|')

    # NOTE(review): neither reader is told about the '|' delimiter, so both
    # parse the file with their default separator - confirm this is intended.
    expected = pandas.read_csv(TEST_CSV_FILENAME)
    actual = pd.read_csv(TEST_CSV_FILENAME)
    frames_match = ray_df_equals_pandas(actual, expected)
    assert frames_match

    teardown_csv_file()
@pytest.mark.skip(reason="No clipboard on Travis")
def test_to_clipboard():
    """Check both frames round-trip through the clipboard identically (skipped)."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    # Each frame is copied and immediately read back with pandas.
    ray_df.to_clipboard()
    clip_from_ray = pandas.read_clipboard()

    pandas_df.to_clipboard()
    clip_from_pandas = pandas.read_clipboard()

    assert clip_from_ray.equals(clip_from_pandas)
def test_to_csv():
    """Check that ``to_csv`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.csv"
    pandas_path = "test_pandas.csv"

    ray_df.to_csv(ray_path)
    pandas_df.to_csv(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
def test_to_dense():
    """``to_dense`` on the test frame is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_dense()
def test_to_dict():
    """``to_dict`` on the test frame is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_dict()
def test_to_excel():
    """Check that ``to_excel`` on both test frames produces equal workbooks."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.xlsx"
    pandas_path = "test_pandas.xlsx"

    # Both frames are written through a pandas ExcelWriter.
    ray_writer = pandas.ExcelWriter(ray_path)
    pandas_writer = pandas.ExcelWriter(pandas_path)

    ray_df.to_excel(ray_writer)
    pandas_df.to_excel(pandas_writer)

    ray_writer.save()
    pandas_writer.save()

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(ray_path)
    teardown_test_file(pandas_path)
def test_to_feather():
    """Check that ``to_feather`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.feather"
    pandas_path = "test_pandas.feather"

    ray_df.to_feather(ray_path)
    pandas_df.to_feather(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
def test_to_gbq():
    """``to_gbq`` on the test frame is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()
    destination = "test_df.gbq"

    with pytest.raises(NotImplementedError):
        frame.to_gbq(destination, None)
def test_to_html():
    """Check that ``to_html`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.html"
    pandas_path = "test_pandas.html"

    ray_df.to_html(ray_path)
    pandas_df.to_html(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
def test_to_json():
    """Check that ``to_json`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.json"
    pandas_path = "test_pandas.json"

    ray_df.to_json(ray_path)
    pandas_df.to_json(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
def test_to_latex():
    """``to_latex`` on the test frame is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_latex()
def test_to_msgpack():
    """Check that ``to_msgpack`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.msgpack"
    pandas_path = "test_pandas.msgpack"

    ray_df.to_msgpack(ray_path)
    pandas_df.to_msgpack(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
def test_to_panel():
    """``to_panel`` on the test frame is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_panel()
def test_to_parquet():
    """Check that ``to_parquet`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.parquet"
    pandas_path = "test_pandas.parquet"

    ray_df.to_parquet(ray_path)
    pandas_df.to_parquet(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
def test_to_period():
    """``to_period`` on the test frame is expected to raise NotImplementedError."""
    frame = create_test_ray_dataframe()

    with pytest.raises(NotImplementedError):
        frame.to_period()
def test_to_pickle():
    """Check that ``to_pickle`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.pkl"
    pandas_path = "test_pandas.pkl"

    ray_df.to_pickle(ray_path)
    pandas_df.to_pickle(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
def test_to_sql():
    """Check that ``to_sql`` on both test frames writes the same table.

    Each frame is written to its own sqlite database with ``to_sql`` and the
    table is read back with ``pandas.read_sql``; the two results must be
    equal. (The previous version of this test called ``to_pickle`` and so
    never exercised ``to_sql``.)
    """
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    TEST_SQL_DF_FILENAME = "test_df.sql"
    TEST_SQL_pandas_FILENAME = "test_pandas.sql"

    ray_conn = sqlite3.connect(TEST_SQL_DF_FILENAME)
    pandas_conn = sqlite3.connect(TEST_SQL_pandas_FILENAME)
    try:
        # "replace" keeps the test re-runnable if an earlier run left a
        # database file behind.
        ray_df.to_sql("test", ray_conn, if_exists="replace")
        pandas_df.to_sql("test", pandas_conn, if_exists="replace")

        ray_as_sql = pandas.read_sql("select * from test", ray_conn)
        pandas_as_sql = pandas.read_sql("select * from test", pandas_conn)
    finally:
        ray_conn.close()
        pandas_conn.close()

    assert ray_as_sql.equals(pandas_as_sql)

    teardown_test_file(TEST_SQL_DF_FILENAME)
    teardown_test_file(TEST_SQL_pandas_FILENAME)
def test_to_stata():
    """Check that ``to_stata`` on both test frames produces equal files."""
    ray_df = create_test_ray_dataframe()
    pandas_df = create_test_pandas_dataframe()

    ray_path = "test_df.stata"
    pandas_path = "test_pandas.stata"

    ray_df.to_stata(ray_path)
    pandas_df.to_stata(pandas_path)

    files_match = test_files_eq(ray_path, pandas_path)
    assert files_match

    teardown_test_file(pandas_path)
    teardown_test_file(ray_path)
File diff suppressed because it is too large Load Diff
-577
View File
@@ -1,577 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import pandas
import numpy as np
import ray
from . import get_npartitions
# Maps a (n_row, n_col) shape tuple to the object ID of an all-NaN
# DataFrame of that shape; filled lazily by _get_nan_block_id.
_NAN_BLOCKS = {}
# Size of the LRU cache that each memoize-wrapped function gets.
_MEMOIZER_CAPACITY = 1000 # Capacity per function
class LRUCache:
    """Fixed-capacity least-recently-used cache on top of an OrderedDict.

    Notes:
        - The OrderedDict keeps entries in insertion order.
        - Reading or writing a key re-inserts it at the back, so the entry
          at the front is always the least recently used one.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.cache = collections.OrderedDict()

    def __contains__(self, key):
        return key in self.cache

    def __getitem__(self, key):
        """Return the cached value and mark the key as most recently used.

        Raises KeyError when the key is not cached.
        """
        self.cache[key] = value = self.cache.pop(key)
        return value

    def __setitem__(self, key, value):
        # Remove a stale entry for this key first so that it is not counted
        # when checking the capacity.
        self.cache.pop(key, None)
        if len(self.cache) >= self.capacity:
            # Evict the least recently used entry (front of the queue).
            self.cache.popitem(last=False)
        self.cache[key] = value
class memoize:
    """A basic memoizer that caches the results of a remote function.

    Notes:
        - How is this implemented?
            This memoizer wraps the ``remote`` attribute of the decorated
            object. When the user calls ``f.remote(*args)`` the cache is
            checked first, and the original ``remote`` is only invoked when
            no cached return value exists for those arguments.
        - When should this be used?
            This should be used when we anticipate temporal locality for the
            function. For example, we can reasonably assume users will
            perform columnar operations repetitively over time (like sum()
            or loc[]).
        - Caveat
            The cache key is the tuple of positional arguments, compared by
            hash and equality. An argument that is hashable but mutable (for
            example an instance of a plain user class, which hashes by
            identity) still hits the cache after its contents change:
            ```py
            @memoize
            @ray.remote
            def f(obj):
                ...

            class Box(object):
                pass

            box = Box()
            box.value = 1
            oid_1 = f.remote(box)  # will be cached
            box.value = 3
            oid_2 = f.remote(box)  # cache hit!
            oid_1 == oid_2  # True!
            ```
            Unhashable arguments (lists, dicts, ...) cannot be used as a
            cache key; such calls are forwarded to the original ``remote``
            without caching.
            In short, use this function sparingly. The ideal case is that
            all inputs are ray ObjectIDs because they are immutable objects.
        - Future Development
            - Fix the mutability bug
            - Dynamic cache size (Fixed as 1000 for now)
    """

    def __init__(self, f):
        # Keep a handle on the undecorated remote function
        self.old_remote_func = f.remote
        self.cache = LRUCache(capacity=_MEMOIZER_CAPACITY)

    def remote(self, *args):
        """Return the cached result if these arguments were seen before.

        Args:
            *args: Positional arguments for the wrapped remote function.

        Returns:
            Whatever the wrapped ``remote`` returned for these arguments,
            taken from the cache when available.
        """
        args = tuple(args)
        try:
            is_cached = args in self.cache
        except TypeError:
            # At least one argument is unhashable, so the call cannot be
            # keyed; bypass the cache instead of failing the call.
            return self.old_remote_func(*args)

        if is_cached:
            return self.cache[args]

        result = self.old_remote_func(*args)
        self.cache[args] = result
        return result
def _get_nan_block_id(n_row=1, n_col=1, transpose=False):
    """A memory efficient way to get a block of NaNs.

    One block is stored per shape in the module-level ``_NAN_BLOCKS`` dict
    and reused by later calls with the same shape.

    Args:
        n_row(int): number of rows
        n_col(int): number of columns
        transpose(bool): if true, swap rows and columns

    Returns:
        ObjectID of the NaN block
    """
    if transpose:
        n_row, n_col = n_col, n_row
    shape = (n_row, n_col)
    if shape not in _NAN_BLOCKS:
        # np.nan is the canonical spelling; the np.NaN alias no longer
        # exists in NumPy 2.0.
        arr = np.tile(np.array(np.nan), shape)
        _NAN_BLOCKS[shape] = ray.put(pandas.DataFrame(data=arr))
    return _NAN_BLOCKS[shape]
def _get_lengths(df):
"""Gets the length of the dataframe.
Args:
df: A remote pandas.DataFrame object.
Returns:
Returns an integer length of the dataframe object. If the attempt
fails, returns 0 as the length.
"""
try:
return len(df)
# Because we sometimes have cases where we have summary statistics in our
# DataFrames
except TypeError:
return 0
def _get_widths(df):
"""Gets the width (number of columns) of the dataframe.
Args:
df: A remote pandas.DataFrame object.
Returns:
Returns an integer width of the dataframe object. If the attempt
fails, returns 0 as the length.
"""
try:
return len(df.columns)
# Because we sometimes have cases where we have summary statistics in our
# DataFrames
except TypeError:
return 0
def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
    """Partitions a Pandas DataFrame object.

    Args:
        df (pandas.DataFrame): The pandas DataFrame to convert.
        num_partitions (int): The number of partitions to split the DataFrame
            into. Has priority over row_chunksize.
        row_chunksize (int): The number of rows to put in each partition.

    Returns:
        [ObjectID]: A list of object IDs corresponding to the dataframe
        partitions

    Raises:
        ValueError: If the resulting chunk size is not positive while there
            are rows to split (this used to loop forever).
    """
    if num_partitions is not None:
        # Ceiling division so that no rows are lost off the end.
        row_chunksize = len(df) // num_partitions \
            if len(df) % num_partitions == 0 \
            else len(df) // num_partitions + 1
    else:
        assert row_chunksize is not None

    # With a non-positive chunk size the loop below never shrinks the
    # remaining frame, so fail loudly instead of hanging.
    if row_chunksize <= 0 and len(df) > row_chunksize:
        raise ValueError(
            "row_chunksize must be positive, got {}".format(row_chunksize))

    temp_df = df

    row_partitions = []
    while len(temp_df) > row_chunksize:
        t_df = temp_df[:row_chunksize]
        # reset_index here because we want a pandas.RangeIndex
        # within the partitions. It is smaller and sometimes faster.
        t_df.reset_index(drop=True, inplace=True)
        t_df.columns = pandas.RangeIndex(0, len(t_df.columns))
        top = ray.put(t_df)
        row_partitions.append(top)
        temp_df = temp_df[row_chunksize:]

    # Handle the last chunk correctly.
    # This slice is necessary to prevent modifying original df
    temp_df = temp_df[:]
    temp_df.reset_index(drop=True, inplace=True)
    temp_df.columns = pandas.RangeIndex(0, len(temp_df.columns))
    row_partitions.append(ray.put(temp_df))

    return row_partitions
def from_pandas(df, num_partitions=None, chunksize=None):
    """Build a Ray DataFrame from a pandas DataFrame.

    Args:
        df (pandas.DataFrame): The pandas DataFrame to convert.
        num_partitions (int): The number of partitions to split the DataFrame
            into. Has priority over chunksize.
        chunksize (int): The number of rows to put in each partition.

    Returns:
        A new Ray DataFrame object.
    """
    # Imported here rather than at module level - presumably to avoid a
    # circular import with .dataframe.
    from .dataframe import DataFrame

    partitions = _partition_pandas_dataframe(df, num_partitions, chunksize)
    return DataFrame(row_partitions=partitions,
                     columns=df.columns,
                     index=df.index)
def to_pandas(df):
    """Collect a Ray DataFrame into a single pandas object.

    Args:
        df (ray.DataFrame): The Ray DataFrame to convert.

    Returns:
        The concatenated row partitions, carrying the index and columns of
        ``df``.
    """
    fetched_partitions = ray.get(df._row_partitions)
    result = pandas.concat(fetched_partitions, copy=False)
    # Partitions do not carry the real labels; restore them from df.
    result.index = df.index
    result.columns = df.columns
    return result
@ray.remote
def extractor(df_chunk, row_loc, col_loc):
    """Return ``df_chunk.iloc[row_loc, col_loc]`` from a remote block."""
    # The locators are made writeable first to work around a pandas bug:
    # https://github.com/pandas-dev/pandas/issues/17192
    try:
        for locator in (row_loc, col_loc):
            locator.flags.writeable = True
    except AttributeError:
        # Locators might be scalars or python lists, which have no flags
        pass
    return df_chunk.iloc[row_loc, col_loc]
@ray.remote
def writer(df_chunk, row_loc, col_loc, item):
    """Return a copy of the block with ``item`` written at the given location."""
    updated = df_chunk.copy()
    updated.iloc[row_loc, col_loc] = item
    return updated
def _mask_block_partitions(blk_partitions, row_metadata, col_metadata):
    """Return the squeezed/expanded block partitions as defined by
    row_metadata and col_metadata.

    Note:
        Very naive implementation. One ``extractor`` task is submitted per
        (row, column) pair, each extracting a single cell.
    """
    col_coords = col_metadata._coord_df
    row_coords = row_metadata._coord_df
    shape = (len(row_coords.index), len(col_coords.index))

    result_oids = [
        extractor.remote(
            blk_partitions[row_info.partition, col_info.partition],
            [row_info['index_within_partition']],
            [col_info['index_within_partition']])
        for _, row_info in row_coords.iterrows()
        for _, col_info in col_coords.iterrows()
    ]
    return np.array(result_oids).reshape(shape)
@ray.remote
def _deploy_func(func, dataframe, *args):
    """Run ``func`` on one partition for the _map_partitions call.

    Args:
        func (callable): The function to run.
        dataframe (pandas.DataFrame): The pandas DataFrame for this partition.
        *args: Extra positional arguments forwarded to ``func``.

    Returns:
        The return value of ``func``.
    """
    # Unpacking an empty tuple adds no arguments, so this covers both the
    # with-args and the no-args call.
    return func(dataframe, *args)
def _map_partitions(func, partitions, *argslists):
    """Apply a function to every partition.

    Args:
        func (callable): The function to apply
        partitions ([ObjectID]): The list of partitions to map func on.
        *argslists: With one extra argument, it is passed unchanged to every
            call. With several, each must be as long as ``partitions`` and
            they are zipped with it, one element per call.

    Returns:
        A list of partitions ([ObjectID]) with the result of the function,
        or None when ``partitions`` is None.
    """
    if partitions is None:
        return None

    assert(callable(func))

    n_extra = len(argslists)
    if n_extra == 0:
        call_args = [(part,) for part in partitions]
    elif n_extra == 1:
        shared_arg = argslists[0]
        call_args = [(part, shared_arg) for part in partitions]
    else:
        assert(all(len(args) == len(partitions) for args in argslists))
        call_args = list(zip(partitions, *argslists))

    return [_deploy_func.remote(func, *args) for args in call_args]
@ray.remote
def _build_col_widths(df_col):
    """Compute widths (# of columns) for each partition."""
    width_ids = [_deploy_func.remote(_get_widths, block) for block in df_col]
    return np.array(ray.get(width_ids))
@ray.remote
def _build_row_lengths(df_row):
    """Compute lengths (# of rows) for each partition."""
    length_ids = [_deploy_func.remote(_get_lengths, block) for block in df_row]
    return np.array(ray.get(length_ids))
@ray.remote
def _build_coord_df(lengths, index):
    """Build the coordinate dataframe over all partitions.

    Each row holds a partition number and a position within that partition.
    Zero-length partitions are dropped before numbering, so partition
    numbers count only the non-empty ones.
    """
    nonempty = [n for n in lengths if n > 0]
    if nonempty:
        per_partition = [
            np.column_stack((np.full(n, part), np.arange(n)))
            for part, n in enumerate(nonempty)
        ]
        coords = np.vstack(per_partition)
    else:
        coords = None
    col_names = ("partition", "index_within_partition")
    return pandas.DataFrame(coords, index=index, columns=col_names)
def _create_block_partitions(partitions, axis=0, length=None):
    """Split every partition into blocks and return them as a 2D array.

    Args:
        partitions: The partitions to split.
        axis: 0 keeps the result as built; any other value transposes it.
        length: Caps the number of blocks per partition when given; 0 means
            a single block.

    Returns:
        A 2D numpy array of the object IDs returned by ``create_blocks``.
    """
    if length == 0:
        npartitions = 1
    else:
        npartitions = get_npartitions()
        if length is not None and npartitions > length:
            npartitions = length

    submitted = [
        create_blocks._submit(args=(partition, npartitions, axis),
                              num_return_vals=npartitions)
        for partition in partitions
    ]

    blocks = np.array(submitted)
    if axis != 0:
        # In the case that axis is 1 we have to transpose because we build
        # the columns into rows. Fortunately numpy is efficient at this.
        blocks = blocks.T

    # Sometimes we only get a single column or row, which is problematic for
    # building blocks from the partitions, so the missing dimension is added.
    return fix_blocks_dimensions(blocks, axis)
@ray.remote
def create_blocks(df, npartitions, axis):
    """Remote entry point that delegates to ``create_blocks_helper``."""
    blocks = create_blocks_helper(df, npartitions, axis)
    return blocks
def create_blocks_helper(df, npartitions, axis):
    """Split a DataFrame into ``npartitions`` blocks.

    Args:
        df (pandas.DataFrame): The frame to split.
        npartitions (int): The number of blocks to produce.
        axis: 0 splits the columns into blocks; any other value splits the
            rows.

    Returns:
        ``df`` itself when ``npartitions`` is 1, otherwise a list of
        ``npartitions`` DataFrames whose index is reset and whose columns
        are a RangeIndex. Trailing blocks may be empty.
    """
    # Single partition dataframes don't need to be repartitioned
    if npartitions == 1:
        return df

    # When the size is not a multiple of the number of partitions, round the
    # block size up so that no data is lost off the end.
    split_len = df.shape[axis ^ 1]
    quotient, leftover = divmod(split_len, npartitions)
    block_size = quotient if leftover == 0 else quotient + 1

    blocks = []
    for i in range(npartitions):
        start = i * block_size
        stop = start + block_size
        if axis == 0:
            blocks.append(df.iloc[:, start:stop])
        else:
            blocks.append(df.iloc[start:stop, :])

    for block in blocks:
        block.columns = pandas.RangeIndex(0, len(block.columns))
        block.reset_index(inplace=True, drop=True)
    return blocks
@memoize
@ray.remote
def _blocks_to_col(*partition):
    """Stack blocks on top of each other (axis 0) with a fresh index.

    Returns an empty pandas.Series when no blocks are given.
    """
    if not partition:
        return pandas.Series()
    stacked = pandas.concat(partition, axis=0, copy=False)
    return stacked.reset_index(drop=True)
@memoize
@ray.remote
def _blocks_to_row(*partition):
    """Join blocks side by side (axis 1) with a fresh index and columns."""
    joined = pandas.concat(partition, axis=1, copy=False)
    row_part = joined.reset_index(drop=True)
    # The blocks each carry their own column labels, so the combined frame
    # is relabelled with one RangeIndex to ensure correctness.
    n_columns = len(row_part.columns)
    row_part.columns = pandas.RangeIndex(0, n_columns)
    return row_part
def _inherit_docstrings(parent, excluded=[]):
"""Creates a decorator which overwrites a decorated class' __doc__
attribute with parent's __doc__ attribute. Also overwrites __doc__ of
methods and properties defined in the class with the __doc__ of matching
methods and properties in parent.
Args:
parent (object): Class from which the decorated class inherits __doc__.
excluded (list): List of parent objects from which the class does not
inherit docstrings.
Returns:
function: decorator which replaces the decorated class' documentation
parent's documentation.
"""
def decorator(cls):
if parent not in excluded:
cls.__doc__ = parent.__doc__
for attr, obj in cls.__dict__.items():
parent_obj = getattr(parent, attr, None)
if parent_obj in excluded or \
(not callable(parent_obj) and
not isinstance(parent_obj, property)):
continue
if callable(obj):
obj.__doc__ = parent_obj.__doc__
elif isinstance(obj, property) and obj.fget is not None:
p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__)
setattr(cls, attr, p)
return cls
return decorator
@ray.remote
def _reindex_helper(old_index, new_index, axis, npartitions, *df):
    """Reindexes a dataframe to prepare for join/concat.

    Args:
        old_index: The index/column for this partition.
        new_index: The new index/column to assign.
        axis: Which axis to reindex over.
        npartitions: The number of blocks to split the result into.
        df: The blocks that make up the DataFrame partition.

    Returns:
        A new set of blocks made up of DataFrames.
    """
    other_axis = axis ^ 1
    combined = pandas.concat(df, axis=other_axis)

    # Restore the real labels before reindexing.
    if axis == 1:
        combined.index = old_index
    elif axis == 0:
        combined.columns = old_index

    reindexed = combined.reindex(new_index, copy=False, axis=other_axis)
    return create_blocks_helper(reindexed, npartitions, axis)
@ray.remote
def _co_op_helper(func, left_columns, right_columns, left_df_len, left_idx,
                  *zipped):
    """Copartition operation where two DataFrames must have aligned indexes.

    NOTE: This function assumes things are already copartitioned. Requires that
        row partitions are passed in as blocks.

    Args:
        func: The operation to conduct between two DataFrames.
        left_columns: The column names for the left DataFrame.
        right_columns: The column names for the right DataFrame.
        left_df_len: The length of the left. This is used so we can split up
            the zipped partitions.
        left_idx: If not None, the index to assign to the left DataFrame
            before applying func; the index of the result is then returned
            after the blocks.
        zipped: The DataFrame partitions (in blocks).

    Returns:
        A new set of blocks for the partitioned DataFrame. When left_idx is
        not None this is always a list whose last element is the index of
        the result.
    """
    left = pandas.concat(zipped[:left_df_len], axis=1, copy=False).copy()
    left.columns = left_columns
    if left_idx is not None:
        left.index = left_idx

    right = pandas.concat(zipped[left_df_len:], axis=1, copy=False).copy()
    right.columns = right_columns

    new_rows = func(left, right)

    new_blocks = create_blocks_helper(new_rows, left_df_len, 0)

    if left_idx is not None:
        # create_blocks_helper returns the bare DataFrame rather than a list
        # when asked for a single block; wrap it so that append is a list
        # append and the index is actually part of the result.
        if not isinstance(new_blocks, list):
            new_blocks = [new_blocks]
        new_blocks.append(new_rows.index)

    return new_blocks
@ray.remote
def _match_partitioning(column_partition, lengths, index):
    """Match the number of rows on each partition. Used in df.merge().

    NOTE: This function can cause problems when there are empty column
        partitions.

        The way this function is intended to be used is as follows: Align the
        right partitioning with the left. The left will remain unchanged. Then,
        you are free to perform actions on a per-partition basis with the
        partitioning.

        The index objects must already be identical for this to work correctly.

    Args:
        column_partition: The column partition to change.
        lengths: The lengths of each row partition to match to.
        index: The index of the column_partition. This is used to push
            down to the inner frame for correctness in the merge.

    Returns:
        A list of blocks created from this column partition.
    """
    columns = column_partition.columns
    # We set this because this is the only place we can guarantee correct
    # placement. We use it in the case the user wants to join on the index.
    column_partition.index = index

    remaining = column_partition
    pieces = []
    for length in lengths:
        if len(remaining) == 0:
            # Nothing left to slice: emit an empty frame with the columns.
            pieces.append(pandas.DataFrame(columns=columns))
        else:
            pieces.append(remaining.iloc[:length, :])
            remaining = remaining.iloc[length:, :]
    return pieces
@ray.remote
def _concat_index(*index_parts):
    """Append all later index parts to the first one and return the result."""
    first, rest = index_parts[0], index_parts[1:]
    return first.append(rest)
def fix_blocks_dimensions(blocks, axis):
    """Return ``blocks`` as an at-least-2D array.

    An array with fewer than two dimensions gets a new dimension inserted at
    position ``axis ^ 1``; anything else is returned unchanged.
    """
    if blocks.ndim >= 2:
        return blocks
    return np.expand_dims(blocks, axis=axis ^ 1)
@ray.remote
def _compile_remote_dtypes(*column_of_blocks):
    """Return the dtypes of the blocks, computed from a small sample of each.

    Only the ``loc[0:0]`` slice of every block is concatenated, so the full
    blocks are never combined.
    """
    samples = []
    for block in column_of_blocks:
        samples.append(block.loc[0:0])
    return pandas.concat(samples, copy=False).dtypes