mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 02:46:49 +08:00
12da021717
* Implement df.as_matrix * Addressing comments * Addressing comments
5067 lines
194 KiB
Python
5067 lines
194 KiB
Python
from __future__ import absolute_import
|
||
from __future__ import division
|
||
from __future__ import print_function
|
||
|
||
import pandas as pd
|
||
import functools
|
||
from pandas.api.types import is_scalar
|
||
from pandas.util._validators import validate_bool_kwarg
|
||
from pandas.core.index import _ensure_index_from_sequences
|
||
from pandas._libs import lib
|
||
from pandas.core.dtypes.cast import maybe_upcast_putmask
|
||
from pandas import compat
|
||
from pandas.compat import lzip, to_str, string_types, cPickle as pkl
|
||
import pandas.core.common as com
|
||
from pandas.core.dtypes.common import (
|
||
is_bool_dtype,
|
||
is_list_like,
|
||
is_numeric_dtype,
|
||
is_timedelta64_dtype,
|
||
_get_dtype_from_object)
|
||
from pandas.core.indexing import check_bool_indexer
|
||
from pandas.errors import MergeError
|
||
|
||
import warnings
|
||
import numpy as np
|
||
import ray
|
||
import itertools
|
||
import io
|
||
import sys
|
||
import re
|
||
|
||
from .groupby import DataFrameGroupBy
|
||
from .utils import (
|
||
_deploy_func,
|
||
_map_partitions,
|
||
_partition_pandas_dataframe,
|
||
to_pandas,
|
||
_blocks_to_col,
|
||
_blocks_to_row,
|
||
_create_block_partitions,
|
||
_inherit_docstrings,
|
||
_reindex_helper,
|
||
_co_op_helper,
|
||
_match_partitioning,
|
||
_concat_index,
|
||
_correct_column_dtypes)
|
||
from . import get_npartitions
|
||
from .index_metadata import _IndexMetadata
|
||
|
||
|
||
@_inherit_docstrings(pd.DataFrame)
|
||
class DataFrame(object):
|
||
|
||
def __init__(self, data=None, index=None, columns=None, dtype=None,
             copy=False, col_partitions=None, row_partitions=None,
             block_partitions=None, row_metadata=None, col_metadata=None,
             dtypes_cache=None):
    """Distributed DataFrame object backed by Pandas dataframes.

    Either ``data`` is given (or no partitions at all), in which case a
    pandas DataFrame is built and partitioned, or exactly one kind of
    pre-built partitions is supplied for an internal construction.

    Args:
        data (numpy ndarray (structured or homogeneous) or dict):
            Dict can contain Series, arrays, constants, or list-like
            objects.
        index (pandas.Index or list): The row index for this dataframe.
        columns (pandas.Index): The column names for this dataframe, in
            pandas Index object.
        dtype: Data type to force. Only a single dtype is allowed.
            If None, infer
        copy (boolean): Copy data from inputs.
            Only affects DataFrame / 2d ndarray input
        col_partitions ([ObjectID]): The list of ObjectIDs that contain
            the column dataframe partitions.
        row_partitions ([ObjectID]): The list of ObjectIDs that contain the
            row dataframe partitions.
        block_partitions: A 2D numpy array of block partitions.
        row_metadata (_IndexMetadata):
            Metadata for the new dataframe's rows
        col_metadata (_IndexMetadata):
            Metadata for the new dataframe's columns
        dtypes_cache: Value stored as ``self._dtypes_cache``. If the
            cache is still None once construction finishes,
            ``self._correct_dtypes()`` is called to fill it.
    """
    self._dtypes_cache = dtypes_cache

    # Check type of data and use appropriate constructor
    if data is not None or (col_partitions is None and
                            row_partitions is None and
                            block_partitions is None):

        # Let pandas interpret the user input, then partition the result.
        pd_df = pd.DataFrame(data=data, index=index, columns=columns,
                             dtype=dtype, copy=copy)

        # Cache dtypes
        self._dtypes_cache = pd_df.dtypes

        # TODO convert _partition_pandas_dataframe to block partitioning.
        row_partitions = \
            _partition_pandas_dataframe(pd_df,
                                        num_partitions=get_npartitions())

        self._block_partitions = \
            _create_block_partitions(row_partitions, axis=0,
                                     length=len(pd_df.columns))

        # Set in case we were only given a single row/column for below.
        axis = 0
        columns = pd_df.columns
        index = pd_df.index
    else:
        # created this invariant to make sure we never have to go into the
        # partitions to get the columns
        assert columns is not None or col_metadata is not None, \
            "Columns not defined, must define columns or col_metadata " \
            "for internal DataFrame creations"

        if block_partitions is not None:
            # put in numpy array here to make accesses easier since it's 2D
            self._block_partitions = np.array(block_partitions)
            axis = 0
        else:
            if row_partitions is not None:
                axis = 0
                partitions = row_partitions
                axis_length = len(columns) if columns is not None else \
                    len(col_metadata)
            elif col_partitions is not None:
                axis = 1
                partitions = col_partitions
                axis_length = None
                # All partitions will already have correct dtypes
                self._dtypes_cache = [
                    _deploy_func.remote(lambda df: df.dtypes, pd_df)
                    for pd_df in col_partitions
                ]

            # TODO: write explicit tests for "short and wide"
            # column partitions
            self._block_partitions = \
                _create_block_partitions(partitions, axis=axis,
                                         length=axis_length)

    # Sometimes we only get a single column or row, which is
    # problematic for building blocks from the partitions, so we
    # add whatever dimension we're missing from the input.
    if self._block_partitions.ndim < 2:
        # `axis ^ 1` flips 0 <-> 1, i.e. expands along the other axis.
        self._block_partitions = np.expand_dims(self._block_partitions,
                                                axis=axis ^ 1)

    assert self._block_partitions.ndim == 2, "Block Partitions must be 2D."

    # Create the row and column index objects for using our partitioning.
    # If the objects haven't been inherited, then generate them
    if row_metadata is not None:
        # Copy so that later index changes do not touch the caller's object.
        self._row_metadata = row_metadata.copy()
        if index is not None:
            self.index = index
    else:
        self._row_metadata = _IndexMetadata(self._block_partitions[:, 0],
                                            index=index, axis=0)

    if col_metadata is not None:
        self._col_metadata = col_metadata.copy()
        if columns is not None:
            self.columns = columns
    else:
        self._col_metadata = _IndexMetadata(self._block_partitions[0, :],
                                            index=columns, axis=1)

    # No dtypes were supplied or derived above, so compute them now.
    if self._dtypes_cache is None:
        self._correct_dtypes()
|
||
|
||
def _get_row_partitions(self):
    """Assemble one remote row partition per row of the block grid.

    Returns:
        A list holding the result of ``_blocks_to_row.remote`` for every
        row of ``self._block_partitions``.
    """
    row_parts = []
    for block_row in self._block_partitions:
        row_parts.append(_blocks_to_row.remote(*block_row))
    return row_parts
|
||
|
||
def _set_row_partitions(self, new_row_partitions):
    """Rebuild the block grid from a new list of row partitions.

    Args:
        new_row_partitions: The row partitions to split into blocks.
    """
    n_columns = len(self.columns)
    new_blocks = _create_block_partitions(new_row_partitions, axis=0,
                                          length=n_columns)
    self._block_partitions = new_blocks
|
||
|
||
_row_partitions = property(_get_row_partitions, _set_row_partitions)
|
||
|
||
def _get_col_partitions(self):
    """Assemble one remote column partition per column of the block grid.

    Returns:
        A list holding the result of ``_blocks_to_col.remote`` for every
        column of ``self._block_partitions``.
    """
    n_block_cols = self._block_partitions.shape[1]
    col_parts = []
    for j in range(n_block_cols):
        col_parts.append(
            _blocks_to_col.remote(*self._block_partitions[:, j]))
    return col_parts
|
||
|
||
def _set_col_partitions(self, new_col_partitions):
    """Rebuild the block grid from a new list of column partitions.

    Args:
        new_col_partitions: The column partitions to split into blocks.
    """
    n_rows = len(self.index)
    new_blocks = _create_block_partitions(new_col_partitions, axis=1,
                                          length=n_rows)
    self._block_partitions = new_blocks
|
||
|
||
_col_partitions = property(_get_col_partitions, _set_col_partitions)
|
||
|
||
def __str__(self):
|
||
return repr(self)
|
||
|
||
def _repr_helper_(self):
    """Build the pandas object that ``__repr__`` and ``_repr_html_`` render.

    A frame with at most 60 rows and at most 20 columns is converted in
    full with ``to_pandas``. Otherwise a preview is assembled: the first
    and last 10 columns joined by a "..." column when there are 20 or more
    columns, and the first and last 30 rows joined by a "..." row when
    there are 60 or more rows.

    Returns:
        A pandas DataFrame holding either the whole frame or the preview.
    """
    if len(self._row_metadata) <= 60 and \
       len(self._col_metadata) <= 20:
        return to_pandas(self)

    def _head(df, n, get_local_head=False):
        """Compute the first n rows without creating a new DataFrame."""
        if get_local_head:
            # `df` is already a local pandas object.
            return df.head(n)

        new_dfs = _map_partitions(lambda part: part.head(n), df)

        index = self.index[:n]
        pd_head = pd.concat(ray.get(new_dfs), axis=1, copy=False)
        pd_head.index = index
        pd_head.columns = self.columns
        return pd_head

    def _tail(df, n, get_local_tail=False):
        """Compute the last n rows without creating a new DataFrame."""
        if get_local_tail:
            return df.tail(n)

        new_dfs = _map_partitions(lambda part: part.tail(n), df)

        index = self.index[-n:]
        pd_tail = pd.concat(ray.get(new_dfs), axis=1, copy=False)
        pd_tail.index = index
        pd_tail.columns = self.columns
        return pd_tail

    def _front(df, n):
        """Get the first n columns without creating a new DataFrame."""
        cum_col_lengths = self._col_metadata._lengths.cumsum()
        # First partition at which the running column count reaches n.
        # (Previously hard-coded to 10 and read from the enclosing `x`,
        # ignoring both parameters.)
        index = np.argmax(cum_col_lengths >= n)
        pd_front = pd.concat(ray.get(df[:index + 1]), axis=1, copy=False)
        pd_front = pd_front.iloc[:, :n]
        pd_front.index = self.index
        pd_front.columns = self.columns[:n]
        return pd_front

    def _back(df, n):
        """Get the last n columns without creating a new DataFrame."""
        cum_col_lengths = np.flip(self._col_metadata._lengths,
                                  axis=0).cumsum()
        # Same search as `_front`, counted from the last partition.
        index = np.argmax(cum_col_lengths >= n)
        pd_back = pd.concat(ray.get(df[-(index + 1):]), axis=1, copy=False)
        pd_back = pd_back.iloc[:, -n:]
        pd_back.index = self.index
        pd_back.columns = self.columns[-n:]
        return pd_back

    x = self._col_partitions
    get_local_head = False

    # Get first and last 10 columns if there are 20 or more columns.
    # NOTE(review): the full-conversion check above uses `<= 20` while this
    # uses `>= 20`, so exactly 20 columns is trimmed here -- confirm intent.
    if len(self._col_metadata) >= 20:
        # From here on `x` is a local pandas object, not a partition list.
        get_local_head = True
        front = _front(x, 10)
        back = _back(x, 10)

        col_dots = pd.Series(["..."
                              for _ in range(len(self.index))])
        col_dots.index = self.index
        col_dots.name = "..."
        x = pd.concat([front, col_dots, back], axis=1)

    # If less than 60 rows, x is already in the correct format.
    if len(self._row_metadata) < 60:
        return x

    head = _head(x, 30, get_local_head)
    tail = _tail(x, 30, get_local_head)

    # Make the dots in between the head and tail
    row_dots = pd.Series(["..."
                          for _ in range(len(head.columns))])
    row_dots.index = head.columns
    row_dots.name = "..."

    # We have to do it this way or convert dots to a dataframe and
    # transpose. This seems better.
    result = head.append(row_dots).append(tail)
    return result
|
||
|
||
def __repr__(self):
|
||
# We use pandas repr so that we match them.
|
||
if len(self._row_metadata) <= 60 and \
|
||
len(self._col_metadata) <= 20:
|
||
return repr(self._repr_helper_())
|
||
# The split here is so that we don't repr pandas row lengths.
|
||
result = self._repr_helper_()
|
||
final_result = repr(result).rsplit("\n\n", maxsplit=1)[0] + \
|
||
"\n\n[{0} rows x {1} columns]".format(len(self.index),
|
||
len(self.columns))
|
||
return final_result
|
||
|
||
def _repr_html_(self):
|
||
"""repr function for rendering in Jupyter Notebooks like Pandas
|
||
Dataframes.
|
||
|
||
Returns:
|
||
The HTML representation of a Dataframe.
|
||
"""
|
||
# We use pandas _repr_html_ to get a string of the HTML representation
|
||
# of the dataframe.
|
||
if len(self._row_metadata) <= 60 and \
|
||
len(self._col_metadata) <= 20:
|
||
return self._repr_helper_()._repr_html_()
|
||
# We split so that we insert our correct dataframe dimensions.
|
||
result = self._repr_helper_()._repr_html_()
|
||
return result.split("<p>")[0] + \
|
||
"<p>{0} rows x {1} columns</p>\n</div>".format(len(self.index),
|
||
len(self.columns))
|
||
|
||
def _get_index(self):
|
||
"""Get the index for this DataFrame.
|
||
|
||
Returns:
|
||
The union of all indexes across the partitions.
|
||
"""
|
||
return self._row_metadata.index
|
||
|
||
def _set_index(self, new_index):
|
||
"""Set the index for this DataFrame.
|
||
|
||
Args:
|
||
new_index: The new index to set this
|
||
"""
|
||
self._row_metadata.index = new_index
|
||
|
||
index = property(_get_index, _set_index)
|
||
|
||
def _get_columns(self):
|
||
"""Get the columns for this DataFrame.
|
||
|
||
Returns:
|
||
The union of all indexes across the partitions.
|
||
"""
|
||
return self._col_metadata.index
|
||
|
||
def _set_columns(self, new_index):
|
||
"""Set the columns for this DataFrame.
|
||
|
||
Args:
|
||
new_index: The new index to set this
|
||
"""
|
||
self._col_metadata.index = new_index
|
||
|
||
columns = property(_get_columns, _set_columns)
|
||
|
||
def _arithmetic_helper(self, remote_func, axis, level=None):
|
||
# TODO: We don't support `level` right now
|
||
if level is not None:
|
||
raise NotImplementedError("Level not yet supported.")
|
||
|
||
axis = pd.DataFrame()._get_axis_number(axis) if axis is not None \
|
||
else 0
|
||
|
||
oid_series = ray.get(_map_partitions(remote_func,
|
||
self._col_partitions if axis == 0
|
||
else self._row_partitions))
|
||
|
||
if axis == 0:
|
||
# We use the index to get the internal index.
|
||
oid_series = [(oid_series[i], i) for i in range(len(oid_series))]
|
||
|
||
if len(oid_series) > 0:
|
||
for df, partition in oid_series:
|
||
this_partition = \
|
||
self._col_metadata.partition_series(partition)
|
||
df.index = \
|
||
this_partition[this_partition.isin(df.index)].index
|
||
|
||
result_series = pd.concat([obj[0] for obj in oid_series],
|
||
axis=0, copy=False)
|
||
else:
|
||
result_series = pd.concat(oid_series, axis=0, copy=False)
|
||
result_series.index = self.index
|
||
return result_series
|
||
|
||
def _validate_eval_query(self, expr, **kwargs):
|
||
"""Helper function to check the arguments to eval() and query()
|
||
|
||
Args:
|
||
expr: The expression to evaluate. This string cannot contain any
|
||
Python statements, only Python expressions.
|
||
"""
|
||
if isinstance(expr, str) and expr is '':
|
||
raise ValueError("expr cannot be an empty string")
|
||
|
||
if isinstance(expr, str) and '@' in expr:
|
||
raise NotImplementedError("Local variables not yet supported in "
|
||
"eval.")
|
||
|
||
if isinstance(expr, str) and 'not' in expr:
|
||
if 'parser' in kwargs and kwargs['parser'] == 'python':
|
||
raise NotImplementedError("'Not' nodes are not implemented.")
|
||
|
||
@property
def size(self):
    """Get the number of elements in the DataFrame.

    Returns:
        The row count multiplied by the column count.
    """
    n_rows = len(self.index)
    n_cols = len(self.columns)
    return n_rows * n_cols
|
||
|
||
@property
def ndim(self):
    """Get the number of dimensions for this DataFrame.

    Returns:
        The number of dimensions for this DataFrame, which is always 2.
    """
    # A DataFrame is two-dimensional by definition, so there is no need to
    # launch a remote task on the first row partition to find this out.
    return 2
|
||
|
||
@property
def ftypes(self):
    """Get the ftypes for this DataFrame.

    Returns:
        The ``ftypes`` of the first row partition, relabelled with this
        frame's columns.
    """
    # The ftypes are common across all partitions.
    # The first partition will be enough.
    first_row_partition = self._row_partitions[0]
    pending = _deploy_func.remote(lambda df: df.ftypes, first_row_partition)
    ftypes_series = ray.get(pending)
    ftypes_series.index = self.columns
    return ftypes_series
|
||
|
||
def _correct_dtypes(self):
    """Correct the dtypes of the blocks and refresh the dtypes cache.

    When the block grid has more than one row, every column of blocks is
    passed through ``_correct_column_dtypes`` and the grid is replaced
    with the returned blocks. The cache is then set to one remote dtypes
    lookup per block of the first grid row.
    """
    if self._block_partitions.shape[0] > 1:
        corrected_columns = []
        # Iterating the transpose walks the grid one block column at a time.
        for column in self._block_partitions.T:
            corrected_columns.append(_correct_column_dtypes._submit(
                args=column, num_return_vals=len(column)))
        self._block_partitions = np.array(corrected_columns).T

    first_block_row = self._block_partitions[0]
    self._dtypes_cache = [_deploy_func.remote(lambda df: df.dtypes, block)
                          for block in first_block_row]
|
||
|
||
@property
def dtypes(self):
    """Get the dtypes for this DataFrame.

    A cache that is still a list of ObjectIDs is fetched, concatenated,
    relabelled with the columns and stored back before being returned.

    Returns:
        The cached dtypes for this DataFrame.
    """
    assert self._dtypes_cache is not None

    cache = self._dtypes_cache
    still_remote = isinstance(cache, list) and \
        isinstance(cache[0], ray.local_scheduler.ObjectID)
    if still_remote:
        self._dtypes_cache = pd.concat(ray.get(cache))
        self._dtypes_cache.index = self.columns

    return self._dtypes_cache
|
||
|
||
@property
def empty(self):
    """Determines if the DataFrame is empty.

    Returns:
        True if every row partition reports itself empty.
        False otherwise.
    """
    partition_flags = ray.get(_map_partitions(
        lambda df: df.empty, self._row_partitions))
    return all(partition_flags)
|
||
|
||
@property
def values(self):
    """Create a numpy array with the values from this DataFrame.

    Returns:
        The concatenation of the ``values`` arrays of all row partitions.
    """
    pending = _map_partitions(lambda df: df.values, self._row_partitions)
    partition_arrays = ray.get(pending)
    return np.concatenate(partition_arrays)
|
||
|
||
@property
def axes(self):
    """Get the axes for the DataFrame.

    Returns:
        A two-element list: the index followed by the columns.
    """
    row_axis = self.index
    col_axis = self.columns
    return [row_axis, col_axis]
|
||
|
||
@property
def shape(self):
    """Get the size of each of the dimensions in the DataFrame.

    Returns:
        A tuple of (number of rows, number of columns).
    """
    n_rows = len(self.index)
    n_cols = len(self.columns)
    return n_rows, n_cols
|
||
|
||
def _update_inplace(self, row_partitions=None, col_partitions=None,
|
||
block_partitions=None, columns=None, index=None,
|
||
col_metadata=None, row_metadata=None):
|
||
"""Updates the current DataFrame inplace.
|
||
|
||
Behavior should be similar to the constructor, given the corresponding
|
||
arguments. Note that len(columns) and len(index) should match the
|
||
corresponding dimensions in the partition(s) passed in, otherwise this
|
||
function will complain.
|
||
|
||
Args:
|
||
row_partitions ([ObjectID]):
|
||
The new partitions to replace self._row_partitions directly
|
||
col_partitions ([ObjectID]):
|
||
The new partitions to replace self._col_partitions directly
|
||
columns (pd.Index):
|
||
Index of the column dimension to replace existing columns
|
||
index (pd.Index):
|
||
Index of the row dimension to replace existing index
|
||
|
||
Note:
|
||
If `columns` or `index` are not supplied, they will revert to
|
||
default columns or index respectively, as this function does
|
||
not have enough contextual info to rebuild the indexes
|
||
correctly based on the addition/subtraction of rows/columns.
|
||
"""
|
||
assert row_partitions is not None or col_partitions is not None\
|
||
or block_partitions is not None, \
|
||
"To update inplace, new column or row partitions must be set."
|
||
|
||
if block_partitions is not None:
|
||
self._block_partitions = block_partitions
|
||
|
||
elif row_partitions is not None:
|
||
self._row_partitions = row_partitions
|
||
|
||
elif col_partitions is not None:
|
||
self._col_partitions = col_partitions
|
||
|
||
if col_metadata is not None:
|
||
self._col_metadata = col_metadata
|
||
else:
|
||
assert columns is not None, \
|
||
"If col_metadata is None, columns must be passed in"
|
||
self._col_metadata = _IndexMetadata(
|
||
self._block_partitions[0, :], index=columns, axis=1)
|
||
if row_metadata is not None:
|
||
self._row_metadata = row_metadata
|
||
else:
|
||
# Index can be None for default index, so we don't check
|
||
self._row_metadata = _IndexMetadata(
|
||
self._block_partitions[:, 0], index=index, axis=0)
|
||
|
||
# Update dtypes
|
||
self._correct_dtypes()
|
||
|
||
def add_prefix(self, prefix):
    """Add a prefix to each of the column names.

    Args:
        prefix: Value whose string form is put before every column label.

    Returns:
        A new DataFrame containing the new column names.
    """
    prefix_text = str(prefix)
    renamed = self.columns.map(lambda label: prefix_text + str(label))
    return DataFrame(block_partitions=self._block_partitions,
                     columns=renamed,
                     col_metadata=self._col_metadata,
                     row_metadata=self._row_metadata)
|
||
|
||
def add_suffix(self, suffix):
    """Add a suffix to each of the column names.

    Args:
        suffix: Value whose string form is put after every column label.

    Returns:
        A new DataFrame containing the new column names.
    """
    suffix_text = str(suffix)
    renamed = self.columns.map(lambda label: str(label) + suffix_text)
    return DataFrame(block_partitions=self._block_partitions,
                     columns=renamed,
                     col_metadata=self._col_metadata,
                     row_metadata=self._row_metadata)
|
||
|
||
def applymap(self, func):
    """Apply a function to a DataFrame elementwise.

    Args:
        func (callable): The function to apply.

    Returns:
        A new DataFrame built from the mapped blocks, reusing this
        frame's row and column metadata.

    Raises:
        ValueError: If ``func`` is not callable.
    """
    if not callable(func):
        raise ValueError(
            "\'{0}\' object is not callable".format(type(func)))

    mapped_rows = []
    for block_row in self._block_partitions:
        mapped_rows.append(
            _map_partitions(lambda df: df.applymap(func), block_row))
    new_block_partitions = np.array(mapped_rows)

    return DataFrame(block_partitions=new_block_partitions,
                     row_metadata=self._row_metadata,
                     col_metadata=self._col_metadata)
|
||
|
||
def copy(self, deep=True):
    """Creates a shallow copy of the DataFrame.

    Args:
        deep: Not read by this implementation.

    Returns:
        A new DataFrame pointing to the same partitions as this one.
    """
    shared_state = dict(block_partitions=self._block_partitions,
                        columns=self.columns,
                        index=self.index,
                        dtypes_cache=self.dtypes)
    return DataFrame(**shared_state)
|
||
|
||
def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
            group_keys=True, squeeze=False, **kwargs):
    """Apply a groupby to this DataFrame. See _groupby() remote task.

    A callable ``by`` is called with the index, and a string ``by`` is
    replaced with the values of that column.

    Args:
        by: The value to groupby.
        axis: The axis to groupby.
        level: The level of the groupby.
        as_index: Whether or not to store result as index.
        sort: Whether or not to sort the result by the index.
        group_keys: Whether or not to group the keys.
        squeeze: Whether or not to squeeze.

    Returns:
        A DataFrameGroupBy built from this DataFrame and the arguments.

    Raises:
        NotImplementedError: If ``by`` is a list of existing labels whose
            length does not match the grouped axis.
        KeyError: If the length does not match and some entry of ``by``
            is not a label of this DataFrame.
    """
    axis = pd.DataFrame()._get_axis_number(axis)

    if callable(by):
        by = by(self.index)
    elif isinstance(by, compat.string_types):
        by = self.__getitem__(by).values.tolist()
    elif is_list_like(by):
        expected_length = len(self) if axis == 0 else len(self.columns)
        mismatch = len(by) != expected_length

        all_are_labels = all([obj in self for obj in by])
        if all_are_labels and mismatch:
            raise NotImplementedError(
                "Groupby with lists of columns not yet supported.")
        if mismatch:
            raise KeyError(next(x for x in by if x not in self))

    return DataFrameGroupBy(self, by, axis, level, as_index, sort,
                            group_keys, squeeze, **kwargs)
|
||
|
||
def sum(self, axis=None, skipna=True, level=None, numeric_only=None,
        min_count=1, **kwargs):
    """Perform a sum across the DataFrame.

    Args:
        axis (int): The axis to sum on.
        skipna (bool): True to skip NA values, false otherwise.
        level: Forwarded to pandas and to ``_arithmetic_helper``.
        numeric_only: Forwarded to pandas.
        min_count: Forwarded to pandas.

    Returns:
        The sum of the DataFrame.
    """
    def sum_partition(df):
        # Every argument is forwarded unchanged to the pandas sum.
        return df.sum(axis=axis, skipna=skipna, level=level,
                      numeric_only=numeric_only, min_count=min_count,
                      **kwargs)

    return self._arithmetic_helper(sum_partition, axis, level)
|
||
|
||
def abs(self):
    """Apply an absolute value function to all numeric columns.

    Returns:
        A new DataFrame with the applied absolute value.

    Raises:
        TypeError: If any column has object dtype.
    """
    if any(np.dtype('O') == t for t in self.dtypes):
        # TODO Give a more accurate error to Pandas
        # Single message string: the previous two-argument form rendered
        # the message as a tuple.
        raise TypeError("bad operand type for abs(): 'str'")

    new_block_partitions = np.array([_map_partitions(lambda df: df.abs(),
                                                     block)
                                     for block in self._block_partitions])

    return DataFrame(block_partitions=new_block_partitions,
                     columns=self.columns,
                     index=self.index)
|
||
|
||
def isin(self, values):
    """Fill a DataFrame with booleans for cells contained in values.

    Args:
        values (iterable, DataFrame, Series, or dict): The values to find.

    Returns:
        A new DataFrame with booleans representing whether or not a cell
        is in values.
        True: cell is contained in values.
        False: otherwise
    """
    checked_rows = []
    for block_row in self._block_partitions:
        checked_rows.append(
            _map_partitions(lambda df: df.isin(values), block_row))
    new_block_partitions = np.array(checked_rows)

    return DataFrame(block_partitions=new_block_partitions,
                     columns=self.columns,
                     index=self.index)
|
||
|
||
def isna(self):
    """Fill a DataFrame with booleans for cells containing NA.

    Returns:
        A new DataFrame with booleans representing whether or not a cell
        is NA.
        True: cell contains NA.
        False: otherwise.
    """
    checked_rows = []
    for block_row in self._block_partitions:
        checked_rows.append(
            _map_partitions(lambda df: df.isna(), block_row))
    new_block_partitions = np.array(checked_rows)

    return DataFrame(block_partitions=new_block_partitions,
                     row_metadata=self._row_metadata,
                     col_metadata=self._col_metadata)
|
||
|
||
def isnull(self):
    """Fill a DataFrame with booleans for cells containing a null value.

    Returns:
        A new DataFrame with booleans representing whether or not a cell
        is null.
        True: cell contains null.
        False: otherwise.
    """
    checked_rows = []
    for block_row in self._block_partitions:
        checked_rows.append(
            _map_partitions(lambda df: df.isnull(), block_row))
    new_block_partitions = np.array(checked_rows)

    return DataFrame(block_partitions=new_block_partitions,
                     row_metadata=self._row_metadata,
                     col_metadata=self._col_metadata)
|
||
|
||
def keys(self):
    """Get the info axis for the DataFrame.

    Returns:
        The column labels of this DataFrame.
    """
    info_axis = self.columns
    return info_axis
|
||
|
||
def transpose(self, *args, **kwargs):
    """Transpose columns and rows for the DataFrame.

    Every block is transposed and the block grid itself is transposed;
    the index and columns swap roles. ``*args`` and ``**kwargs`` are not
    read.

    Returns:
        A new DataFrame transposed from this DataFrame.
    """
    flipped_rows = []
    for block_row in self._block_partitions:
        flipped_rows.append(_map_partitions(lambda df: df.T, block_row))
    flipped_grid = np.array(flipped_rows).T

    return DataFrame(block_partitions=flipped_grid,
                     columns=self.index,
                     index=self.columns)
|
||
|
||
T = property(transpose)
|
||
|
||
def dropna(self, axis=0, how='any', thresh=None, subset=None,
           inplace=False):
    """Create a new DataFrame from the removed NA values from this one.

    Args:
        axis (int, tuple, or list): The axis to apply the drop. A
            list-like is applied one axis at a time, in order.
        how (str): How to drop the NA values.
            'all': drop the label if all values are NA.
            'any': drop the label if any values are NA.
        thresh (int): Passed unchanged to pandas ``dropna``, where it is
            the number of non-NA values required to keep a label.
        subset ([label]): Labels to consider from other axis.
        inplace (bool): Change this DataFrame or return a new DataFrame.
            True: Modify the data for this DataFrame, return None.
            False: Create a new DataFrame and return it.

    Returns:
        If inplace is set to True, returns None, otherwise returns a new
            DataFrame with the dropna applied.

    Raises:
        ValueError: If ``how`` is neither None, 'any' nor 'all'.
        TypeError: If both ``how`` and ``thresh`` are None.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    if is_list_like(axis):
        axis = [pd.DataFrame()._get_axis_number(ax) for ax in axis]

        result = self
        # TODO(kunalgosar): this builds an intermediate dataframe,
        # which does unnecessary computation
        for ax in axis:
            # Each pass is a non-inplace call on the previous result.
            result = result.dropna(
                axis=ax, how=how, thresh=thresh, subset=subset)
        if not inplace:
            return result

        self._update_inplace(block_partitions=result._block_partitions,
                             columns=result.columns,
                             index=result.index)

        return None

    axis = pd.DataFrame()._get_axis_number(axis)

    if how is not None and how not in ['any', 'all']:
        raise ValueError('invalid how option: %s' % how)
    if how is None and thresh is None:
        raise TypeError('must specify how or thresh')

    if subset is not None:
        subset = set(subset)

        # Keep only the requested labels that exist on the other axis,
        # in that axis's own order.
        if axis == 1:
            subset = [item for item in self.index if item in subset]
        else:
            subset = [item for item in self.columns if item in subset]

    def dropna_helper(df):
        # Runs on one partition: returns the trimmed partition with its
        # dropped-along axis relabelled positionally, plus the labels that
        # survived.
        new_df = df.dropna(axis=axis, how=how, thresh=thresh,
                           subset=subset, inplace=False)

        if axis == 1:
            new_index = new_df.columns
            new_df.columns = pd.RangeIndex(0, len(new_df.columns))
        else:
            new_index = new_df.index
            new_df.reset_index(drop=True, inplace=True)

        return new_df, new_index

    parts = self._col_partitions if axis == 1 else self._row_partitions
    result = [_deploy_func._submit(args=(dropna_helper, df),
                                   num_return_vals=2) for df in parts]
    # Split the (partition, labels) pairs into two parallel lists.
    new_parts, new_vals = [list(t) for t in zip(*result)]

    if axis == 1:
        new_vals = [self._col_metadata.get_global_indices(i, vals)
                    for i, vals in enumerate(ray.get(new_vals))]

        # This flattens the 2d array to 1d
        new_vals = [i for j in new_vals for i in j]
        new_cols = self.columns[new_vals]

        if not inplace:
            return DataFrame(col_partitions=new_parts,
                             columns=new_cols,
                             index=self.index)

        self._update_inplace(col_partitions=new_parts,
                             columns=new_cols,
                             index=self.index)

    else:
        new_vals = [self._row_metadata.get_global_indices(i, vals)
                    for i, vals in enumerate(ray.get(new_vals))]

        # This flattens the 2d array to 1d
        new_vals = [i for j in new_vals for i in j]
        new_rows = self.index[new_vals]

        if not inplace:
            return DataFrame(row_partitions=new_parts,
                             index=new_rows,
                             columns=self.columns)

        self._update_inplace(row_partitions=new_parts,
                             index=new_rows,
                             columns=self.columns)

    return None
|
||
|
||
def add(self, other, axis='columns', level=None, fill_value=None):
    """Add this DataFrame to another or a scalar/list.

    Args:
        other: What to add to this DataFrame.
        axis: The axis to apply addition over. Only applicable to Series
            or list 'other'.
        level: A level in the multilevel axis to add over.
        fill_value: The value to fill NaN.

    Returns:
        A new DataFrame with the applied addition.
    """
    pandas_op = pd.DataFrame.add
    return self._operator_helper(pandas_op, other, axis, level, fill_value)
|
||
|
||
def agg(self, func, axis=0, *args, **kwargs):
    """Alias for ``aggregate``; all arguments are forwarded unchanged."""
    outcome = self.aggregate(func, axis, *args, **kwargs)
    return outcome
|
||
|
||
def aggregate(self, func, axis=0, *args, **kwargs):
    """Aggregate using ``func`` over the given axis.

    For axis 0 the work is first handed to ``_aggregate``. If that raises
    TypeError or yields None, or if the axis is 1, the call falls back to
    ``apply`` with ``is_transform`` removed from the keyword arguments.

    Args:
        func: The aggregation to perform.
        axis: The axis to aggregate over.

    Returns:
        The result of ``_aggregate`` or of ``apply``.
    """
    axis = pd.DataFrame()._get_axis_number(axis)

    outcome = None
    if axis == 0:
        try:
            outcome = self._aggregate(func, *args, axis=axis, **kwargs)
        except TypeError:
            # Deliberate best-effort: fall through to `apply` below.
            pass

    if outcome is not None:
        return outcome

    kwargs.pop('is_transform', None)
    return self.apply(func, axis=axis, args=args, **kwargs)
|
||
|
||
def _aggregate(self, arg, *args, **kwargs):
    """Dispatch an aggregation according to the type of ``arg``.

    Args:
        arg: A string naming a function, a list-like of functions, or a
            callable. Dictionaries are not supported.
        *args: Forwarded to the chosen implementation.
        **kwargs: Forwarded to the chosen implementation after ``_axis``
            and ``_level`` have been removed.

    Returns:
        The result of ``_string_function``, ``apply`` or
        ``_callable_function``, depending on ``arg``.

    Raises:
        NotImplementedError: If ``arg`` is a dict.
        ValueError: If ``arg`` is none of the supported kinds.
    """
    _axis = kwargs.pop('_axis', None)
    if _axis is None:
        _axis = getattr(self, 'axis', 0)
    kwargs.pop('_level', None)

    if isinstance(arg, compat.string_types):
        return self._string_function(arg, *args, **kwargs)

    # Dictionaries have complex behavior because they can be renamed here.
    elif isinstance(arg, dict):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")
    elif is_list_like(arg):
        return self.apply(arg, axis=_axis, args=args, **kwargs)
    elif callable(arg):
        # Return the computed result; it was previously discarded, which
        # made this branch return None.
        return self._callable_function(arg, _axis, *args, **kwargs)
    else:
        # TODO Make pandas error
        raise ValueError("type {} is not callable".format(type(arg)))
|
||
|
||
def _string_function(self, func, *args, **kwargs):
    """Resolve an aggregation given by name.

    An attribute of this object with that name is used first: it is
    called with the arguments if callable, otherwise returned as is.

    Args:
        func: Name of the function or attribute.
        *args: Positional arguments for a callable attribute.
        **kwargs: Keyword arguments for a callable attribute.

    Returns:
        The call result or the attribute value.

    Raises:
        NotImplementedError: If the name only exists on numpy.
        ValueError: If the name is found on neither this object nor numpy.
    """
    assert isinstance(func, compat.string_types)

    own_attr = getattr(self, func, None)

    if own_attr is not None:
        if callable(own_attr):
            return own_attr(*args, **kwargs)

        # A plain attribute accepts no arguments other than these two
        # keywords.
        assert len(args) == 0
        unexpected = [kwarg
                      for kwarg in kwargs
                      if kwarg not in ['axis', '_level']]
        assert len(unexpected) == 0
        return own_attr

    numpy_attr = getattr(np, func, None)
    if numpy_attr is not None:
        raise NotImplementedError("Numpy aggregates not yet supported.")

    raise ValueError("{} is an unknown string function".format(func))
|
||
|
||
def _callable_function(self, func, axis, *args, **kwargs):
    """Run ``df.agg(func, ...)`` remotely on every partition along an axis.

    Args:
        func: The aggregation passed to each partition's ``agg``.
        axis: 0 to work on column partitions, anything else to work on
            row partitions.
        *args: Extra positional arguments forwarded to ``agg``.
        **kwargs: Extra keyword arguments forwarded to ``agg``. An
            ``is_transform`` entry is consumed by the helper and enables
            the length check below.

    Returns:
        A pandas Series when every partition produced a Series, otherwise
        a DataFrame built from the resulting partitions.

    Raises:
        ValueError: If only some of the partitions produced a Series.
    """
    if axis == 0:
        partitions = self._col_partitions
    else:
        partitions = self._row_partitions

    # The labels of the non-partitioned axis travel with the kwargs so
    # the helper can restore them on each partition before aggregating.
    if axis == 1:
        kwargs['axis'] = axis
        kwargs['temp_columns'] = self.columns
    else:
        kwargs['temp_index'] = self.index

    def agg_helper(df, arg, *args, **kwargs):
        # Restore the labels shipped in kwargs, then strip the private
        # entries so they are not forwarded to ``agg``.
        if 'temp_index' in kwargs:
            df.index = kwargs.pop('temp_index', None)
        else:
            df.columns = kwargs.pop('temp_columns', None)
        is_transform = kwargs.pop('is_transform', False)
        new_df = df.agg(arg, *args, **kwargs)

        is_series = False

        if isinstance(new_df, pd.Series):
            is_series = True
            index = None
            columns = None
        else:
            # Remember the result's labels, then reset both axes of the
            # partition to positional labels.
            index = new_df.index \
                if not isinstance(new_df.index, pd.RangeIndex) \
                else None
            columns = new_df.columns
            new_df.columns = pd.RangeIndex(0, len(new_df.columns))
            new_df.reset_index(drop=True, inplace=True)

        if is_transform:
            if is_scalar(new_df) or len(new_df) != len(df):
                raise ValueError("transforms cannot produce "
                                 "aggregated results")

        return is_series, new_df, index, columns

    # One remote call per partition, each yielding four object handles.
    remote_result = \
        [_deploy_func._submit(args=(lambda df: agg_helper(df,
                                                          func,
                                                          *args,
                                                          **kwargs),
                                    part), num_return_vals=4)
         for part in partitions]

    # This magic transposes the list comprehension returned from remote
    is_series, new_parts, index, columns = \
        [list(t) for t in zip(*remote_result)]

    # This part is because agg can allow returning a Series or a
    # DataFrame, and we have to determine which here. Shouldn't add
    # too much to latency in either case because the booleans can
    # be returned immediately
    is_series = ray.get(is_series)
    if all(is_series):
        new_series = pd.concat(ray.get(new_parts))
        new_series.index = self.columns if axis == 0 else self.index
        return new_series
    # This error is thrown when some of the partitions return Series and
    # others return DataFrames. We do not allow mixed returns.
    elif any(is_series):
        raise ValueError("no results.")
    # The remaining logic executes when we have only DataFrames in the
    # remote objects. We build a Ray DataFrame from the Pandas partitions.
    elif axis == 0:
        # Only the first partition's index is consulted; the column
        # labels of all partitions are concatenated in order.
        new_index = ray.get(index[0])
        columns = ray.get(columns)
        columns = columns[0].append(columns[1:])

        return DataFrame(col_partitions=new_parts,
                         columns=columns,
                         index=self.index if new_index is None
                         else new_index)
    else:
        new_index = ray.get(index[0])
        columns = ray.get(columns)
        columns = columns[0].append(columns[1:])
        return DataFrame(row_partitions=new_parts,
                         columns=columns,
                         index=self.index if new_index is None
                         else new_index)
|
||
|
||
def align(self, other, join='outer', axis=None, level=None, copy=True,
          fill_value=None, method=None, limit=None, fill_axis=0,
          broadcast_axis=None):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def all(self, axis=None, bool_only=None, skipna=None, level=None,
        **kwargs):
    """Return whether all elements are True over requested axis

    Note:
        The reduction is evaluated with pandas' ``all`` inside
        ``_arithmetic_helper``, which receives the axis and level.
    """
    def partition_all(partition):
        # Same arguments as the caller supplied, applied per partition.
        return partition.all(axis=axis, bool_only=bool_only, skipna=skipna,
                             level=level, **kwargs)

    reduced = self._arithmetic_helper(partition_all, axis, level)
    return reduced
|
||
|
||
def any(self, axis=None, bool_only=None, skipna=None, level=None,
        **kwargs):
    """Return whether any elements are True over requested axis

    Note:
        The reduction is evaluated with pandas' ``any`` inside
        ``_arithmetic_helper``, which receives the axis and level.
    """
    def partition_any(partition):
        # Same arguments as the caller supplied, applied per partition.
        return partition.any(axis=axis, bool_only=bool_only, skipna=skipna,
                             level=level, **kwargs)

    reduced = self._arithmetic_helper(partition_any, axis, level)
    return reduced
|
||
|
||
def append(self, other, ignore_index=False, verify_integrity=False):
    """Append another DataFrame/list/Series to this one.

    Args:
        other: The object to append to this.
        ignore_index: Ignore the index on appending.
        verify_integrity: Verify the integrity of the index on completion.

    Returns:
        A new DataFrame containing the concatenated values.

    Raises:
        TypeError: If a Series (or dict) without a name is appended while
            ignore_index is False.
    """
    if isinstance(other, (pd.Series, dict)):
        if isinstance(other, dict):
            other = pd.Series(other)
        if other.name is None and not ignore_index:
            raise TypeError('Can only append a Series if ignore_index=True'
                            ' or if the Series has a name')

        if other.name is None:
            index = None
        else:
            # other must have the same index name as self, otherwise
            # index name will be reset
            index = pd.Index([other.name], name=self.index.name)

        # Existing columns first, then any new labels brought by other.
        combined_columns = self.columns.tolist() + self.columns.union(
            other.index).difference(self.columns).tolist()
        other = other.reindex(combined_columns, copy=False)
        other = pd.DataFrame(other.values.reshape((1, len(other))),
                             index=index,
                             columns=combined_columns)
        other = other._convert(datetime=True, timedelta=True)
    elif (isinstance(other, list) and other and
            not isinstance(other[0], DataFrame)):
        # The emptiness check keeps ``other[0]`` from raising IndexError
        # when an empty list is appended.
        other = pd.DataFrame(other)
        if (self.columns.get_indexer(other.columns) >= 0).all():
            other = other.loc[:, self.columns]

    from .concat import concat
    if isinstance(other, (list, tuple)):
        # list() lets a tuple of frames be concatenated with [self].
        to_concat = [self] + list(other)
    else:
        to_concat = [self, other]

    return concat(to_concat, ignore_index=ignore_index,
                  verify_integrity=verify_integrity)
|
||
|
||
def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
          args=(), **kwds):
    """Apply a function along input axis of DataFrame.

    Args:
        func: The function to apply. May be a string naming a method of
            this object, a dict mapping column labels to functions, a
            list-like of functions, or a callable.
        axis: The axis over which to apply the func.
        broadcast: Whether or not to broadcast.
        raw: Whether or not to convert to a Series.
        reduce: Whether or not to try to apply reduction procedures.
        args: Positional arguments forwarded to func.
        **kwds: Keyword arguments forwarded to func.

    Returns:
        Series or DataFrame, depending on func.

    Raises:
        TypeError: If a dict or list-like func is used with axis=1.
    """
    axis = pd.DataFrame()._get_axis_number(axis)

    if isinstance(func, compat.string_types):
        if axis == 1:
            kwds['axis'] = axis
        return getattr(self, func)(*args, **kwds)
    elif isinstance(func, dict):
        if axis == 1:
            raise TypeError(
                "(\"'dict' object is not callable\", "
                "'occurred at index {0}'".format(self.index[0]))
        if len(self.columns) != len(set(self.columns)):
            warnings.warn(
                'duplicate column names not supported with apply().',
                FutureWarning, stacklevel=2)
        has_list = list in map(type, func.values())
        part_ind_tuples = [(self._col_metadata[key], key) for key in func]

        if has_list:
            # if input dict has a list, the function to apply must wrap
            # single functions in lists as well to get the desired output
            # format
            result = [_deploy_func.remote(
                lambda df: df.iloc[:, ind].apply(
                    func[key] if is_list_like(func[key])
                    else [func[key]]),
                self._col_partitions[part])
                for (part, ind), key in part_ind_tuples]
            return pd.concat(ray.get(result), axis=1)
        else:
            result = [_deploy_func.remote(
                lambda df: df.iloc[:, ind].apply(func[key]),
                self._col_partitions[part])
                for (part, ind), key in part_ind_tuples]
            return pd.Series(ray.get(result), index=func.keys())

    elif is_list_like(func):
        if axis == 1:
            raise TypeError(
                "(\"'list' object is not callable\", "
                "'occurred at index {0}'".format(self.index[0]))
        # TODO: some checking on functions that return Series or Dataframe
        new_cols = _map_partitions(lambda df: df.apply(func),
                                   self._col_partitions)

        # resolve function names for the DataFrame index
        new_index = [f_name if isinstance(f_name, compat.string_types)
                     else f_name.__name__ for f_name in func]
        return DataFrame(col_partitions=new_cols,
                         columns=self.columns,
                         index=new_index,
                         col_metadata=self._col_metadata)
    elif callable(func):
        # axis is passed positionally: combining ``axis=axis`` with
        # ``*args`` made the first extra argument collide with ``axis``
        # and raise TypeError whenever ``args`` was non-empty.
        return self._callable_function(func, axis, *args, **kwds)
|
||
|
||
def as_blocks(self, copy=True):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def as_matrix(self, columns=None):
    """Convert the frame to its Numpy-array representation.

    Args:
        columns: If None, return all columns, otherwise,
            returns specified columns.

    Returns:
        values: ndarray
    """
    # TODO this is very inefficient, also see __array__
    local_frame = to_pandas(self)
    return local_frame.as_matrix(columns)
|
||
|
||
def asfreq(self, freq, method=None, how=None, normalize=False,
           fill_value=None):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def asof(self, where, subset=None):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def assign(self, **kwargs):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def astype(self, dtype, copy=True, errors='raise', **kwargs):
    # Cast the frame to the requested dtype(s).
    #
    # Args:
    #     dtype: A single dtype applied to every block, or a dict mapping
    #         column labels to dtypes.
    #     copy: When truthy a new DataFrame is returned; otherwise the
    #         partitions of this object are replaced and None is returned.
    #     errors: Forwarded to pandas' astype; with 'raise', dict keys that
    #         are not column labels raise KeyError.
    #     **kwargs: Forwarded to pandas' astype.
    #
    # Raises:
    #     KeyError: If dtype is a dict with a key that is not a column
    #         label and errors == 'raise'.
    if isinstance(dtype, dict):
        if (not set(dtype.keys()).issubset(set(self.columns)) and
                errors == 'raise'):
            raise KeyError(
                "Only a column name can be used for the key in "
                "a dtype mappings argument.")
        # Re-key the mapping by column position where the label is known;
        # unknown labels (only reachable when errors != 'raise') are kept
        # as they are.
        new_dict = {}
        for key in dtype:
            if key in self.columns:
                idx = self.columns.get_loc(key)
            else:
                idx = key
            new_dict[idx] = dtype[key]
        new_rows = _map_partitions(lambda df, dt: df.astype(dtype=dt,
                                                            copy=True,
                                                            errors=errors,
                                                            **kwargs),
                                   self._row_partitions, new_dict)
        if copy:
            return DataFrame(row_partitions=new_rows,
                             columns=self.columns,
                             index=self.index)
        self._row_partitions = new_rows
    else:
        new_blocks = [_map_partitions(lambda d: d.astype(dtype=dtype,
                                                         copy=True,
                                                         errors=errors,
                                                         **kwargs),
                                      block)
                      for block in self._block_partitions]
        if copy:
            return DataFrame(block_partitions=new_blocks,
                             columns=self.columns,
                             index=self.index)
        self._block_partitions = new_blocks
|
||
|
||
def at_time(self, time, asof=False):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def between_time(self, start_time, end_time, include_start=True,
                 include_end=True):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
    """Synonym for DataFrame.fillna(method='bfill')

    Returns the filled object, or None when inplace is truthy.
    """
    filled = self.fillna(method='bfill', axis=axis, limit=limit,
                         downcast=downcast, inplace=inplace)
    if inplace:
        return None
    return filled
|
||
|
||
def bool(self):
    """Return the bool of a single element PandasObject.

    This must be a boolean scalar value, either True or False. Raise a
    ValueError if the PandasObject does not have exactly 1 element, or that
    element is not boolean

    Returns:
        The result of pandas' ``bool`` on the materialised frame.

    Raises:
        ValueError: If the shape is neither (1,) nor (1, 1).
    """
    shape = self.shape
    if shape != (1,) and shape != (1, 1):
        # Built from adjacent literals so the message is a single line;
        # the former triple-quoted literal leaked raw newlines and the
        # source indentation into the error text.
        raise ValueError("The PandasObject does not have exactly "
                         "1 element. Return the bool of a single "
                         "element PandasObject. The truth value is "
                         "ambiguous. Use a.empty, a.item(), a.any() "
                         "or a.all().")
    else:
        return to_pandas(self).bool()
|
||
|
||
def boxplot(self, column=None, by=None, ax=None, fontsize=None, rot=0,
            grid=True, figsize=None, layout=None, return_type=None,
            **kwds):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def clip(self, lower=None, upper=None, axis=None, inplace=False, *args,
         **kwargs):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def clip_lower(self, threshold, axis=None, inplace=False):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def clip_upper(self, threshold, axis=None, inplace=False):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def combine(self, other, func, fill_value=None, overwrite=True):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def combine_first(self, other):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def compound(self, axis=None, skipna=None, level=None):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def consolidate(self, inplace=False):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def convert_objects(self, convert_dates=True, convert_numeric=False,
                    convert_timedeltas=True, copy=True):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def corr(self, method='pearson', min_periods=1):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def corrwith(self, other, axis=0, drop=False):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def count(self, axis=0, level=None, numeric_only=False):
    """Get the count of non-null objects in the DataFrame.

    Arguments:
        axis: 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        level: If the axis is a MultiIndex (hierarchical), count along a
            particular level, collapsing into a DataFrame.
        numeric_only: Include only float, int, boolean data

    Returns:
        The count, in a Series (or DataFrame if level is specified).
    """
    def partition_count(partition):
        # Same arguments as the caller supplied, applied per partition.
        return partition.count(axis=axis, level=level,
                               numeric_only=numeric_only)

    counted = self._arithmetic_helper(partition_count, axis, level)
    return counted
|
||
|
||
def cov(self, min_periods=None):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def _cumulative_helper(self, func, axis):
    """Map a cumulative function over the partitions along ``axis``.

    Args:
        func: Function applied to each partition.
        axis: Axis identifier; None is treated as 0.

    Returns:
        A new DataFrame built from the mapped partitions, reusing this
        object's row and column metadata.
    """
    if axis is None:
        axis_number = 0
    else:
        axis_number = pd.DataFrame()._get_axis_number(axis)

    if axis_number != 0:
        mapped_rows = _map_partitions(func, self._row_partitions)
        return DataFrame(row_partitions=mapped_rows,
                         row_metadata=self._row_metadata,
                         col_metadata=self._col_metadata)

    mapped_cols = _map_partitions(func, self._col_partitions)
    return DataFrame(col_partitions=mapped_cols,
                     row_metadata=self._row_metadata,
                     col_metadata=self._col_metadata)
|
||
|
||
def cummax(self, axis=None, skipna=True, *args, **kwargs):
    """Perform a cumulative maximum across the DataFrame.

    Args:
        axis (int): The axis to take maximum on.
        skipna (bool): True to skip NA values, false otherwise.

    Returns:
        The cumulative maximum of the DataFrame.
    """
    def partition_cummax(partition):
        return partition.cummax(axis=axis, skipna=skipna, *args, **kwargs)

    return self._cumulative_helper(partition_cummax, axis)
|
||
|
||
def cummin(self, axis=None, skipna=True, *args, **kwargs):
    """Perform a cumulative minimum across the DataFrame.

    Args:
        axis (int): The axis to cummin on.
        skipna (bool): True to skip NA values, false otherwise.

    Returns:
        The cumulative minimum of the DataFrame.
    """
    def partition_cummin(partition):
        return partition.cummin(axis=axis, skipna=skipna, *args, **kwargs)

    return self._cumulative_helper(partition_cummin, axis)
|
||
|
||
def cumprod(self, axis=None, skipna=True, *args, **kwargs):
    """Perform a cumulative product across the DataFrame.

    Args:
        axis (int): The axis to take product on.
        skipna (bool): True to skip NA values, false otherwise.

    Returns:
        The cumulative product of the DataFrame.
    """
    def partition_cumprod(partition):
        return partition.cumprod(axis=axis, skipna=skipna, *args, **kwargs)

    return self._cumulative_helper(partition_cumprod, axis)
|
||
|
||
def cumsum(self, axis=None, skipna=True, *args, **kwargs):
    """Perform a cumulative sum across the DataFrame.

    Args:
        axis (int): The axis to take sum on.
        skipna (bool): True to skip NA values, false otherwise.

    Returns:
        The cumulative sum of the DataFrame.
    """
    def partition_cumsum(partition):
        return partition.cumsum(axis=axis, skipna=skipna, *args, **kwargs)

    return self._cumulative_helper(partition_cumsum, axis)
|
||
|
||
def describe(self, percentiles=None, include=None, exclude=None):
    """
    Generates descriptive statistics that summarize the central tendency,
    dispersion and shape of a dataset's distribution, excluding NaN values.

    Args:
        percentiles (list-like of numbers, optional):
            The percentiles to include in the output.
        include: White-list of data types to include in results
        exclude: Black-list of data types to exclude in results

    Returns: Series/DataFrame of summary statistics
    """
    def describe_helper(df):
        """This to ensure nothing goes on with non-numeric columns"""
        try:
            return df.select_dtypes(exclude='object').describe(
                percentiles=percentiles,
                include=include,
                exclude=exclude)
        # This exception is thrown when there are only non-numeric columns
        # in this partition
        except ValueError:
            return pd.DataFrame()

    # Summarise every column partition remotely and fetch the results.
    summaries = ray.get(_map_partitions(describe_helper,
                                        self._col_partitions))

    # The position of a summary in the list is its partition number; use
    # it to translate the partition-internal column labels back to the
    # labels recorded in the column metadata.
    for partition, df in enumerate(summaries):
        this_partition = self._col_metadata.partition_series(partition)
        df.columns = this_partition[this_partition.isin(df.columns)].index

    return pd.concat(summaries, axis=1, copy=False)
|
||
|
||
def diff(self, periods=1, axis=0):
    """Finds the difference between elements on the axis requested

    Args:
        periods: Periods to shift for forming difference
        axis: Take difference over rows or columns

    Returns:
        DataFrame with the diff applied
    """
    axis = pd.DataFrame()._get_axis_number(axis)

    if axis == 0:
        partitions = self._col_partitions
    else:
        partitions = self._row_partitions

    def partition_diff(df):
        return df.diff(axis=axis, periods=periods)

    result = _map_partitions(partition_diff, partitions)

    if axis == 0:
        return DataFrame(col_partitions=result,
                         columns=self.columns,
                         index=self.index)
    if axis == 1:
        return DataFrame(row_partitions=result,
                         columns=self.columns,
                         index=self.index)
|
||
|
||
def div(self, other, axis='columns', level=None, fill_value=None):
    """Divides this DataFrame against another DataFrame/Series/scalar.

    Args:
        other: The object to use to apply the divide against this.
        axis: The axis to divide over.
        level: The Multilevel index level to apply divide over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the Divide applied.
    """
    # Dispatch pandas' ``div``; this previously passed ``add`` and so
    # computed a sum instead of a quotient.
    return self._operator_helper(pd.DataFrame.div, other, axis, level,
                                 fill_value)
|
||
|
||
def divide(self, other, axis='columns', level=None, fill_value=None):
    """Synonym for div.

    Args:
        other: The object to use to apply the divide against this.
        axis: The axis to divide over.
        level: The Multilevel index level to apply divide over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the Divide applied.
    """
    quotient = self.div(other, axis, level, fill_value)
    return quotient
|
||
|
||
def dot(self, other):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
         inplace=False, errors='raise'):
    """Return new object with labels in requested axis removed.
    Args:
        labels: Index or column labels to drop.

        axis: Whether to drop labels from the index (0 / 'index') or
            columns (1 / 'columns').

        index, columns: Alternative to specifying axis (labels, axis=1 is
            equivalent to columns=labels).

        level: For MultiIndex

        inplace: If True, do operation inplace and return None.

        errors: If 'ignore', suppress error and existing labels are
            dropped.
    Returns:
        dropped : type of caller

    Raises:
        NotImplementedError: If level is given.
        ValueError: If labels is combined with index/columns, if none of
            them is given, or if a label is missing from its axis while
            errors is not 'ignore'.
    """
    # TODO implement level
    if level is not None:
        raise NotImplementedError("Level not yet supported for drop")

    inplace = validate_bool_kwarg(inplace, "inplace")
    if labels is not None:
        if index is not None or columns is not None:
            raise ValueError("Cannot specify both 'labels' and "
                             "'index'/'columns'")
        axis = pd.DataFrame()._get_axis_name(axis)
        axes = {axis: labels}
    elif index is not None or columns is not None:
        axes, _ = pd.DataFrame()._construct_axes_from_arguments((index,
                                                                 columns),
                                                                {})
    else:
        raise ValueError("Need to specify at least one of 'labels', "
                         "'index' or 'columns'")
    # All changes are made on a copy; self is only touched at the very
    # end when inplace is requested.
    obj = self.copy()

    def drop_helper(obj, axis, label):
        # TODO(patyang): If you drop from the index first, you can do it
        # in batch by returning the dropped items. Likewise coords.drop
        # leaves the coords df in an inconsistent state.
        if axis == 'index':
            try:
                coords = obj._row_metadata[label]
                if isinstance(coords, pd.DataFrame):
                    partitions = list(coords['partition'])
                    indexes = list(coords['index_within_partition'])
                else:
                    partitions, indexes = coords
                    partitions = [partitions]
                    indexes = [indexes]

                for part, index in zip(partitions, indexes):
                    x = _deploy_func.remote(
                        lambda df: df.drop(labels=index, axis=axis,
                                           errors='ignore'),
                        obj._row_partitions[part])
                    obj._row_partitions = \
                        [obj._row_partitions[i] if i != part
                         else x
                         for i in range(len(obj._row_partitions))]

                    # The decrement here is because we're dropping one at a
                    # time and the index is automatically updated when we
                    # convert back to blocks.
                    obj._row_metadata.squeeze(part, index)

                obj._row_metadata.drop(labels=label)
            except KeyError:
                return obj
        else:
            try:
                coords = obj._col_metadata[label]
                if isinstance(coords, pd.DataFrame):
                    partitions = list(coords['partition'])
                    indexes = list(coords['index_within_partition'])
                else:
                    partitions, indexes = coords
                    partitions = [partitions]
                    indexes = [indexes]

                for part, index in zip(partitions, indexes):
                    x = _deploy_func.remote(
                        lambda df: df.drop(labels=index, axis=axis,
                                           errors='ignore'),
                        obj._col_partitions[part])
                    obj._col_partitions = \
                        [obj._col_partitions[i] if i != part
                         else x
                         for i in range(len(obj._col_partitions))]

                    # The decrement here is because we're dropping one at a
                    # time and the index is automatically updated when we
                    # convert back to blocks.
                    obj._col_metadata.squeeze(part, index)

                obj._col_metadata.drop(labels=label)
            except KeyError:
                return obj

        return obj

    for axis, labels in axes.items():
        if labels is None:
            continue

        if is_list_like(labels):
            for label in labels:
                if errors != 'ignore' and label and \
                        label not in getattr(self, axis):
                    # .format() is required here: passing the values as
                    # extra exception arguments left the placeholders
                    # unfilled and made the message a tuple.
                    raise ValueError(
                        "The label [{}] is not in the [{}]".format(
                            label, axis))
                else:
                    obj = drop_helper(obj, axis, label)
        else:
            if errors != 'ignore' and labels and \
                    labels not in getattr(self, axis):
                raise ValueError(
                    "The label [{}] is not in the [{}]".format(
                        labels, axis))
            else:
                obj = drop_helper(obj, axis, labels)

    if not inplace:
        return obj
    else:
        self._row_metadata = obj._row_metadata
        self._col_metadata = obj._col_metadata
        self._block_partitions = obj._block_partitions
|
||
|
||
def drop_duplicates(self, subset=None, keep='first', inplace=False):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def duplicated(self, subset=None, keep='first'):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def eq(self, other, axis='columns', level=None):
    """Checks element-wise that this is equal to other.

    Args:
        other: A DataFrame or Series or scalar to compare to.
        axis: The axis to perform the eq over.
        level: The Multilevel index level to apply eq over.

    Returns:
        A new DataFrame filled with Booleans.
    """
    comparison = pd.DataFrame.eq
    return self._operator_helper(comparison, other, axis, level)
|
||
|
||
def equals(self, other):
    """
    Checks if other DataFrame is elementwise equal to the current one

    Returns:
        Boolean: True if equal, otherwise False
    """
    # TODO(kunalgosar): Implement Copartition and use to implement equals
    def helper(df, index, other_series):
        # Compare one row of this frame's partition with the matching
        # row taken from ``other``.
        return df.iloc[index['index_within_partition']] \
            .equals(other_series)

    results = []
    # Cache of the most recently fetched partition of ``other`` so that
    # consecutive rows from the same partition trigger a single ray.get.
    other_partition = None
    other_df = None
    # TODO: Make the appropriate coord df accessor methods for this fxn
    # NOTE(review): only the labels present in ``other`` are visited, and
    # each one is looked up in this frame's coordinates with ``.loc``.
    for i, idx in other._row_metadata._coord_df.iterrows():
        if idx['partition'] != other_partition:
            other_df = ray.get(other._row_partitions[idx['partition']])
            other_partition = idx['partition']
        # TODO: group series here into full df partitions to reduce
        # the number of remote calls to helper
        other_series = other_df.iloc[idx['index_within_partition']]
        curr_index = self._row_metadata._coord_df.loc[i]
        curr_df = self._row_partitions[int(curr_index['partition'])]
        results.append(_deploy_func.remote(helper,
                                           curr_df,
                                           curr_index,
                                           other_series))

    # Fetch the per-row comparisons in order; stop at the first mismatch.
    for r in results:
        if not ray.get(r):
            return False
    return True
|
||
|
||
def eval(self, expr, inplace=False, **kwargs):
    """Evaluate a Python expression as a string using various backends.
    Args:
        expr: The expression to evaluate. This string cannot contain any
            Python statements, only Python expressions.

        parser: The parser to use to construct the syntax tree from the
            expression. The default of 'pandas' parses code slightly
            different than standard Python. Alternatively, you can parse
            an expression using the 'python' parser to retain strict
            Python semantics. See the enhancing performance documentation
            for more details.

        engine: The engine used to evaluate the expression.

        truediv: Whether to use true division, like in Python >= 3

        local_dict: A dictionary of local variables, taken from locals()
            by default.

        global_dict: A dictionary of global variables, taken from
            globals() by default.

        resolvers: A list of objects implementing the __getitem__ special
            method that you can use to inject an additional collection
            of namespaces to use for variable lookup. For example, this is
            used in the query() method to inject the index and columns
            variables that refer to their respective DataFrame instance
            attributes.

        level: The number of prior stack frames to traverse and add to
            the current scope. Most users will not need to change this
            parameter.

        target: This is the target object for assignment. It is used when
            there is variable assignment in the expression. If so, then
            target must support item assignment with string keys, and if a
            copy is being returned, it must also support .copy().

        inplace: If target is provided, and the expression mutates target,
            whether to modify target inplace. Otherwise, return a copy of
            target with the mutation.
    Returns:
        ndarray, numeric scalar, DataFrame, Series
    """
    self._validate_eval_query(expr, **kwargs)

    # Column labels used by the helper; note that ``columns`` is rebound
    # further down, after the partitions have been mapped.
    columns = self.columns

    def eval_helper(df):
        # Give the partition its real column labels so the expression
        # can refer to them, then evaluate without mutating in place.
        df.columns = columns
        result = df.eval(expr, inplace=False, **kwargs)
        # If result is a series, expr was not an assignment expression.
        if not isinstance(result, pd.Series):
            result.columns = pd.RangeIndex(0, len(result.columns))
        return result

    inplace = validate_bool_kwarg(inplace, "inplace")
    new_rows = _map_partitions(eval_helper, self._row_partitions)

    # Only the first partition's result type is inspected to decide
    # between a Series result and a DataFrame result.
    result_type = ray.get(_deploy_func.remote(lambda df: type(df),
                                              new_rows[0]))
    if result_type is pd.Series:
        new_series = pd.concat(ray.get(new_rows), axis=0)
        new_series.index = self.index
        return new_series

    # Replay the expression on a transposed copy of the column
    # coordinates to obtain the resulting column labels.
    columns_copy = self._col_metadata._coord_df.copy().T
    columns_copy.eval(expr, inplace=True, **kwargs)
    columns = columns_copy.columns

    if inplace:
        self._update_inplace(row_partitions=new_rows, columns=columns)
    else:
        return DataFrame(columns=columns, row_partitions=new_rows)
|
||
|
||
def ewm(self, com=None, span=None, halflife=None, alpha=None,
        min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def expanding(self, min_periods=1, freq=None, center=False, axis=0):
    # Placeholder: not implemented in Pandas on Ray yet, so every call
    # raises NotImplementedError regardless of the arguments.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
    """Synonym for DataFrame.fillna(method='ffill')

    Returns the filled object, or None when inplace is truthy.
    """
    filled = self.fillna(method='ffill', axis=axis, limit=limit,
                         downcast=downcast, inplace=inplace)
    if inplace:
        return None
    return filled
|
||
|
||
def fillna(self, value=None, method=None, axis=None, inplace=False,
           limit=None, downcast=None, **kwargs):
    """Fill NA/NaN values using the specified method.

    Args:
        value: Value to use to fill holes. Either a scalar, or a dict or
            Series mapping labels to the fill value for that label. This
            value cannot be a list or a tuple.

        method: Method to use for filling holes. One of 'pad' / 'ffill'
            (propagate the last valid observation forward) or
            'backfill' / 'bfill' (use the NEXT valid observation to fill
            the gap).

        axis: 0 or 'index', 1 or 'columns'. Defaults to 0.

        inplace: If True, fill in place. Note: this will modify any other
            views on this object.

        limit: If method is specified, this is the maximum number of
            consecutive NaN values to forward/backward fill. In other
            words, if there is a gap with more than this number of
            consecutive NaNs, it will only be partially filled. If method
            is not specified, this is the maximum number of entries along
            the entire axis where NaNs will be filled. Must be greater
            than 0 if not None.

        downcast: A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an
            appropriate equal type.

        **kwargs: Forwarded unchanged to the pandas fillna call that is
            run on each partition.

    Returns:
        filled: DataFrame, or None if inplace is True.

    Raises:
        NotImplementedError: If value is a pandas DataFrame.
        TypeError: If value is a list or a tuple.
        ValueError: If neither or both of value and method are given, or
            if method is not a recognised fill method.
    """
    # TODO implement value passed as DataFrame
    if isinstance(value, pd.DataFrame):
        raise NotImplementedError("Passing a DataFrame as the value for "
                                  "fillna is not yet supported.")

    inplace = validate_bool_kwarg(inplace, 'inplace')

    axis = pd.DataFrame()._get_axis_number(axis) \
        if axis is not None \
        else 0

    if isinstance(value, (list, tuple)):
        raise TypeError('"value" parameter must be a scalar or dict, but '
                        'you passed a "{0}"'.format(type(value).__name__))
    if value is None and method is None:
        raise ValueError('must specify a fill method or value')
    if value is not None and method is not None:
        raise ValueError('cannot specify both a fill method and value')
    if method is not None and method not in ['backfill', 'bfill', 'pad',
                                             'ffill']:
        expecting = 'pad (ffill) or backfill (bfill)'
        msg = 'Invalid fill method. Expecting {expecting}. Got {method}'\
              .format(expecting=expecting, method=method)
        raise ValueError(msg)

    new_obj = self if inplace else self.copy()

    # Column partitions and column metadata are used for axis 0, row
    # partitions and row metadata otherwise.
    if axis == 0:
        parts, coords_obj = new_obj._col_partitions, new_obj._col_metadata
    else:
        parts, coords_obj = new_obj._row_partitions, new_obj._row_metadata

    if isinstance(value, (pd.Series, dict)):
        value = dict(value)

        # Collect the fill values per partition, keyed by the position of
        # the label inside that partition. Grouping first means that a
        # partition holding several of the requested labels is filled once
        # with all of its values, instead of each label producing its own
        # copy of the partition.
        fills_by_partition = {}
        for label, fill in value.items():
            try:
                part, index = coords_obj[label]
            # Pandas ignores these errors so we will suppress them too.
            except KeyError:
                continue
            fills_by_partition.setdefault(part, {})[index] = fill

        # `fill_map` is bound as a default argument so that each lambda
        # keeps the mapping of its own partition.
        new_vals = {
            part: _deploy_func.remote(
                lambda df, fill_map=fill_map: df.fillna(
                    value=fill_map,
                    method=method,
                    axis=axis,
                    inplace=False,
                    limit=limit,
                    downcast=downcast,
                    **kwargs), parts[part])
            for part, fill_map in fills_by_partition.items()}

        # Not every partition was changed, so we put everything back that
        # was not changed and update those that were.
        new_parts = [new_vals[i] if i in new_vals else parts[i]
                     for i in range(len(parts))]
    else:
        new_parts = _map_partitions(lambda df: df.fillna(
            value=value,
            method=method,
            axis=axis,
            inplace=False,
            limit=limit,
            downcast=downcast,
            **kwargs), parts)

    if axis == 0:
        new_obj._update_inplace(col_partitions=new_parts,
                                columns=self.columns,
                                index=self.index)
    else:
        new_obj._update_inplace(row_partitions=new_parts,
                                columns=self.columns,
                                index=self.index)
    if not inplace:
        return new_obj
|
||
|
||
def filter(self, items=None, like=None, regex=None, axis=None):
    """Select the rows or columns whose labels satisfy one criterion.

    Exactly one of `items`, `like` and `regex` has to be given.

    Args:
        items (list): keep the labels contained in this list
        like (string): keep the labels that contain this substring
        regex (string): keep the labels in which the pattern is found
        axis: axis whose labels are tested; the columns when left as None

    Returns:
        A new dataframe restricted to the matching labels.
    """
    supplied = com._count_not_none(items, like, regex)
    if supplied == 0:
        raise TypeError('Must pass either `items`, `like`, or `regex`')
    if supplied > 1:
        raise TypeError('Keyword arguments `items`, `like`, or `regex` '
                        'are mutually exclusive')

    # Columns are the default info axis for dataframes
    axis = pd.DataFrame()._get_axis_number(
        'columns' if axis is None else axis)
    labels = self.index if axis == 0 else self.columns

    if items is not None:
        keep = labels.isin(items)
    elif like is not None:
        keep = labels.map(lambda label: like in to_str(label)).tolist()
    else:
        pattern = re.compile(regex)
        keep = labels.map(
            lambda label: pattern.search(to_str(label)) is not None
        ).tolist()

    if axis == 0:
        return self[keep]
    return self[self.columns[keep]]
|
||
|
||
def first(self, offset):
    # Selecting an initial period by offset is not available yet; every
    # call fails and points the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def first_valid_index(self):
    """Look up the label of the first non-NA/null value.

    Returns:
        scalar: the first valid index as reported by the row metadata
    """
    row_metadata = self._row_metadata
    return row_metadata.first_valid_index()
|
||
|
||
def floordiv(self, other, axis='columns', level=None, fill_value=None):
    """Floor-divide this DataFrame by another DataFrame/Series/scalar.

    Args:
        other: The object to divide this DataFrame by.
        axis: The axis to divide over.
        level: The Multilevel index level to apply the division over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame holding the result of the floor division.
    """
    operation = pd.DataFrame.floordiv
    return self._operator_helper(operation, other, axis, level, fill_value)
|
||
|
||
@classmethod
def from_csv(cls, path, header=0, sep=',', index_col=0,
             parse_dates=True, encoding=None, tupleize_cols=None,
             infer_datetime_format=False):
    # Reading a frame from a CSV file through this constructor is not
    # available yet; every call fails and points the caller at the
    # project. The default separator is ',' to match the signature of
    # pandas.DataFrame.from_csv.
    raise NotImplementedError(
        "To contribute to Pandas on Ray, please visit "
        "github.com/ray-project/ray.")
|
||
|
||
@classmethod
def from_dict(cls, data, orient='columns', dtype=None):
    # Building a frame from a dict through this constructor is not
    # available yet; every call fails and points the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@classmethod
def from_items(cls, items, columns=None, orient='columns'):
    # Building a frame from (key, value) items is not available yet; every
    # call fails and points the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@classmethod
def from_records(cls, data, index=None, exclude=None, columns=None,
                 coerce_float=False, nrows=None):
    # Building a frame from records is not available yet; every call fails
    # and points the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def ge(self, other, axis='columns', level=None):
    """Compare this DataFrame to other element-wise using >=.

    Args:
        other: A DataFrame or Series or scalar to compare to.
        axis: The axis to perform the ge over.
        level: The Multilevel index level to apply ge over.

    Returns:
        A new DataFrame filled with Booleans.
    """
    comparison = pd.DataFrame.ge
    return self._operator_helper(comparison, other, axis, level)
|
||
|
||
def get(self, key, default=None):
    """Look up key with item access, falling back to a default.

    Args:
        key (DataFrame column, Panel slice) : the key to look up
        default: what to return when the lookup fails with a KeyError,
            ValueError or IndexError

    Returns:
        value (type of items contained in object) : the value stored at
        the key, or default if the lookup failed
    """
    try:
        found = self[key]
    except (KeyError, ValueError, IndexError):
        return default
    return found
|
||
|
||
def get_dtype_counts(self):
    """Count how many times each dtype occurs in this object.

    Returns:
        The counts of dtypes in this object.
    """
    # Only the first row partition is inspected.
    # NOTE(review): presumably every row partition holds all columns, so
    # one partition is representative - confirm.
    first_rows = self._row_partitions[0]
    counts = _deploy_func.remote(lambda df: df.get_dtype_counts(),
                                 first_rows)
    return ray.get(counts)
|
||
|
||
def get_ftype_counts(self):
    """Count how many times each ftype occurs in this object.

    Returns:
        The counts of ftypes in this object.
    """
    # Only the first row partition is inspected.
    # NOTE(review): presumably every row partition holds all columns, so
    # one partition is representative - confirm.
    first_rows = self._row_partitions[0]
    counts = _deploy_func.remote(lambda df: df.get_ftype_counts(),
                                 first_rows)
    return ray.get(counts)
|
||
|
||
def get_value(self, index, col, takeable=False):
    # Single-value access is not available yet; every call fails and
    # points the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def get_values(self):
    # Not available yet; every call fails and points the caller at the
    # project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def gt(self, other, axis='columns', level=None):
    """Compare this DataFrame to other element-wise using >.

    Args:
        other: A DataFrame or Series or scalar to compare to.
        axis: The axis to perform the gt over.
        level: The Multilevel index level to apply gt over.

    Returns:
        A new DataFrame filled with Booleans.
    """
    comparison = pd.DataFrame.gt
    return self._operator_helper(comparison, other, axis, level)
|
||
|
||
def head(self, n=5):
    """Return the leading rows of the dataframe.

    Args:
        n (int): The number of rows to return.

    Returns:
        A new dataframe with the first n rows of the dataframe, or a copy
        of this one when n is at least the number of rows.
    """
    row_count = len(self._row_metadata)
    if row_count <= n:
        return self.copy()

    # Trim every column partition, then trim the index to match.
    trimmed_cols = _map_partitions(lambda df: df.head(n),
                                   self._col_partitions)
    trimmed_index = self._row_metadata.index[:n]

    return DataFrame(col_partitions=trimmed_cols,
                     col_metadata=self._col_metadata,
                     index=trimmed_index)
|
||
|
||
def hist(self, data, column=None, by=None, grid=True, xlabelsize=None,
         xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
         sharey=False, figsize=None, layout=None, bins=10, **kwds):
    # Histogram plotting is not available yet; every call fails and points
    # the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def idxmax(self, axis=0, skipna=True):
    """Find the index of the first occurrence of the maximum on an axis.

    Args:
        axis (int): Identify the max over the rows (1) or columns (0).
        skipna (bool): Whether or not to skip NA values.

    Returns:
        A Series with the index for each maximum value for the axis
        specified.

    Raises:
        TypeError: If any column has the object dtype.
    """
    object_dtype = np.dtype('O')
    if any(dtype == object_dtype for dtype in self.dtypes):
        raise TypeError(
            "reduction operation 'argmax' not allowed for this dtype")

    positions = self._arithmetic_helper(
        lambda df: df.idxmax(axis=axis, skipna=skipna), axis)

    # The helper reports internal indices; translate them to the labels of
    # this frame's index.
    # NOTE(review): the translation always goes through self.index,
    # whichever axis was requested - confirm that is intended for axis=1.
    return positions.apply(lambda position: self.index[position])
|
||
|
||
def idxmin(self, axis=0, skipna=True):
    """Get the index of the first occurrence of the min value of the axis.

    Args:
        axis (int): Identify the min over the rows (1) or columns (0).
        skipna (bool): Whether or not to skip NA values.

    Returns:
        A Series with the index for each minimum value for the axis
        specified.

    Raises:
        TypeError: If any column has the object dtype.
    """
    if any(d == np.dtype('O') for d in self.dtypes):
        # This is the minimum reduction, so the message names argmin.
        raise TypeError(
            "reduction operation 'argmin' not allowed for this dtype")

    def remote_func(df):
        return df.idxmin(axis=axis, skipna=skipna)

    internal_indices = self._arithmetic_helper(remote_func, axis)
    # do this to convert internal indices to correct index
    # NOTE(review): the translation always goes through self.index,
    # whichever axis was requested - confirm that is intended for axis=1.
    return internal_indices.apply(lambda x: self.index[x])
|
||
|
||
def infer_objects(self):
    # Not available yet; every call fails and points the caller at the
    # project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
         null_counts=None):
    """Write a summary of this dataframe.

    The summary is assembled from the pandas info() output of every column
    partition. It is written to buf when buf is truthy and to sys.stdout
    otherwise.

    Args:
        verbose: Forwarded to the pandas info() call on each partition.
        buf: The writable buffer to send the summary to.
        max_cols: Forwarded to the pandas info() call on each partition.
        memory_usage: Forwarded to the pandas info() call on each
            partition. Unless it equals 'deep', the reported total is
            suffixed with '+'.
        null_counts: Forwarded to the pandas info() call on each partition.
    """
    def partition_report(df):
        # Capture what pandas would print for this partition.
        captured = io.StringIO()
        df.info(verbose=verbose,
                buf=captured,
                max_cols=max_cols,
                memory_usage=memory_usage,
                null_counts=null_counts)
        return captured.getvalue()

    # Gather the per-partition reports and cut them into lines
    reports = ray.get(_map_partitions(partition_report,
                                      self._col_partitions))
    report_lines = ''.join(reports).split('\n')

    # Class line, index summary and the column header
    heading = [
        '<class \'ray.dataframe.dataframe.DataFrame\'>\n',
        self.index.summary() + '\n',
        'Data columns (total {0} columns):\n'.format(len(self.columns)),
    ]

    # The per-column details are the report lines that start with digits,
    # i.e. with the name the column has inside its partition. That name is
    # swapped for the real column name.
    column_pattern = re.compile('^[0-9]+.+')
    column_matches = (column_pattern.match(line) for line in report_lines)
    per_column = [found.group(0) for found in column_matches
                  if found is not None]
    described = ["{0}\t{1}\n".format(self.columns[position],
                                     text.split(" ", 1)[1])
                 for position, text in enumerate(per_column)]
    col_string = ''.join(described) + '\n'

    # One "dtype(count)" entry per dtype; the slice drops the trailing
    # comma.
    dtypes_string = "dtypes: " + ''.join(
        "{0}({1}),".format(dtype, count)
        for dtype, count in self.dtypes.value_counts().iteritems())
    dtypes_string = dtypes_string[:-1] + '\n'

    # Memory usage: take the first run of digits from every "memory" line
    # and add the numbers up.
    memory_pattern = re.compile('^memory+.+')
    mem_vals = []
    for line in report_lines:
        found = memory_pattern.match(line)
        if found is not None:
            mem_vals.append(
                float(re.search(r'\d+', found.group(0)).group()))

    if not mem_vals:
        memory_string = ""
    elif memory_usage == 'deep':
        memory_string = 'memory usage: {0} bytes'.format(sum(mem_vals))
    else:
        memory_string = 'memory usage: {0}+ bytes'.format(sum(mem_vals))

    summary = ''.join(heading + [col_string, dtypes_string, memory_string])

    # Write to specified output buffer
    if buf:
        buf.write(summary)
    else:
        sys.stdout.write(summary)
|
||
|
||
def insert(self, loc, column, value, allow_duplicates=False):
    """Add a column to this DataFrame at the given position, in place.

    Args:
        loc (int): Insertion index. Must verify 0 <= loc <= len(columns).
        column (hashable object): Label of the inserted column.
        value (int, Series, or array-like): The values to insert. A value
            that is not list-like is repeated for every row.
        allow_duplicates (bool): Whether to allow duplicate column names.

    Raises:
        ValueError: If value has the wrong length, if column already
            exists and duplicates are not allowed, or if loc is negative.
        IndexError: If loc is larger than the number of columns.
    """
    row_count = len(self.index)
    if not is_list_like(value):
        value = np.full(row_count, value)

    if len(value) != row_count:
        raise ValueError(
            "Length of values does not match length of index")
    if not allow_duplicates and column in self.columns:
        raise ValueError(
            "cannot insert {0}, already exists".format(column))

    column_count = len(self.columns)
    if loc > column_count:
        raise IndexError(
            "index {0} is out of bounds for axis 0 with size {1}".format(
                loc, column_count))
    if loc < 0:
        raise ValueError("unbounded slice")

    # The column metadata reports which partition receives the column and
    # where inside that partition it goes.
    partition, index_within_partition = \
        self._col_metadata.insert(column, loc)

    def insert_col_part(df):
        df.insert(index_within_partition, column, value, allow_duplicates)
        return df

    # Run the insert on the affected column partition only, and swap the
    # result in for that partition.
    updated_part = _deploy_func.remote(insert_col_part,
                                       self._col_partitions[partition])
    new_cols = [updated_part if position == partition else existing
                for position, existing in enumerate(self._col_partitions)]
    new_col_names = self.columns.insert(loc, column)

    self._update_inplace(col_partitions=new_cols, columns=new_col_names)
|
||
|
||
def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
                limit_direction='forward', downcast=None, **kwargs):
    # Interpolation is not available yet; every call fails and points the
    # caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def iterrows(self):
    """Iterate over DataFrame rows as (index, Series) pairs.

    Note:
        Generators can't be pickled, so the remote function expands the
        generator into a list before it is fetched. This is not ideal.

    Returns:
        An iterator over the (index, Series) pairs of the frame.
    """
    # Materialise the labels once. Building the list again for every row
    # would make the iteration quadratic in the number of rows.
    row_labels = list(self.index)
    column_labels = self.columns

    def update_iterrow(series, i):
        """Helper function to correct the columns + name of the Series."""
        series.index = column_labels
        series.name = row_labels[i]
        return series

    iters = ray.get([_deploy_func.remote(
        lambda df: list(df.iterrows()), part)
        for part in self._row_partitions])
    iters = itertools.chain.from_iterable(iters)
    # Each element of `iters` is a (partition-local label, Series) pair;
    # only the Series is kept and relabelled.
    series = map(lambda s: update_iterrow(s[1][1], s[0]), enumerate(iters))

    return zip(self.index, series)
|
||
|
||
def items(self):
    """Iterator over (column name, Series) pairs.

    Note:
        Generators can't be pickled, so the remote function expands the
        generator into a list before it is fetched. This is not ideal.

    Returns:
        A generator that iterates over the columns of the frame.
    """
    iters = ray.get([_deploy_func.remote(
        lambda df: list(df.items()), part)
        for part in self._row_partitions])

    # Materialise the column labels once instead of rebuilding the list
    # for every column that is yielded.
    column_labels = list(self.columns)

    def concat_iters(iterables):
        # zip(*iterables) lines up the pieces of one column across all row
        # partitions; concatenating them rebuilds the full column.
        for position, pieces in enumerate(zip(*iterables)):
            series = pd.concat([_series for _, _series in pieces])
            series.index = self.index
            series.name = column_labels[position]
            yield (series.name, series)

    return concat_iters(iters)
|
||
|
||
def iteritems(self):
    """Iterate over (column name, Series) pairs.

    Note:
        This is an alias: it hands back exactly what .items() returns.

    Returns:
        A generator that iterates over the columns of the frame.
    """
    column_pairs = self.items()
    return column_pairs
|
||
|
||
def itertuples(self, index=True, name='Pandas'):
    """Iterate over the rows of the DataFrame as tuples.

    Args:
        index (boolean, default True): If True, return the index as the
            first element of the tuple.
        name (string, default "Pandas"): The name of the returned
            namedtuples or None to return regular tuples.

    Note:
        Generators can't be pickled, so the remote function expands the
        generator into a list before it is fetched. This is not ideal.

    Returns:
        An iterator of tuples, one per row. See args for varying tuples.
    """
    def rows_of(df):
        return list(df.itertuples(index=index, name=name))

    fetched = ray.get([_deploy_func.remote(rows_of, part)
                       for part in self._row_partitions])
    rows = itertools.chain.from_iterable(fetched)

    if not index:
        return rows

    def with_label(row, label):
        # Put the label from this frame's index into the first slot.
        # try/except is used because isinstance(row, namedtuple) won't
        # work.
        try:
            return row._replace(Index=label)
        except AttributeError:  # Tuple not namedtuple
            return (label,) + row[1:]

    return itertools.starmap(with_label, zip(rows, self.index))
|
||
|
||
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
         sort=False):
    """Combine this DataFrame with another frame, a named Series, or a
    collection of frames by aligning them on their indexes.

    Args:
        other: What to join this DataFrame with.
        on: A column name to use from the left for the join. Anything but
            None currently raises NotImplementedError.
        how: What type of join to conduct.
        lsuffix: The suffix to add to column names that match on left.
        rsuffix: The suffix to add to column names that match on right.
        sort: Whether or not to sort.

    Returns:
        The joined DataFrame.
    """
    if on is not None:
        raise NotImplementedError("Not yet.")

    def submit_reindex(source_index, target_index, n_parts, block):
        # Schedule the remote reindexing of one group of blocks from
        # source_index onto target_index.
        return _reindex_helper._submit(
            args=tuple([source_index, target_index, 1, n_parts] +
                       block.tolist()),
            num_return_vals=n_parts)

    if isinstance(other, pd.Series):
        if other.name is None:
            raise ValueError("Other Series must have a name")
        other = DataFrame({other.name: other})

    if not isinstance(other, DataFrame):
        # `other` is treated as a collection of frames from here on.
        # This constraint carried over from Pandas.
        if on is not None:
            raise ValueError("Joining multiple DataFrames only supported"
                             " for joining on index")

        # Joining empty frames that carry only the index, or only the
        # columns, is fast and gives pandas' own error checking for the
        # edge cases.
        new_index = pd.DataFrame(index=self.index).join(
            [pd.DataFrame(index=obj.index) for obj in other],
            how=how, sort=sort).index

        new_column_labels = pd.DataFrame(columns=self.columns).join(
            [pd.DataFrame(columns=obj.columns) for obj in other],
            lsuffix=lsuffix, rsuffix=rsuffix).columns

        new_partition_num = max([len(self._block_partitions.T)] +
                                [len(obj._block_partitions.T)
                                 for obj in other])

        new_self = np.array([
            submit_reindex(self.index, new_index, new_partition_num, block)
            for block in self._block_partitions.T])

        new_others = np.array([
            submit_reindex(obj.index, new_index, new_partition_num, block)
            for obj in other for block in obj._block_partitions.T])

        # Append the columns together (i.e. concat)
        new_block_parts = np.concatenate((new_self, new_others)).T

        # TODO join the two metadata tables for performance.
        return DataFrame(block_partitions=new_block_parts,
                         index=new_index,
                         columns=new_column_labels)

    # Single frame on the right.
    index = self[on] if on is not None else self.index

    new_index = index.join(other.index, how=how, sort=sort)

    # Joining two empty DataFrames is fast, and error checks for us.
    new_column_labels = pd.DataFrame(columns=self.columns) \
        .join(pd.DataFrame(columns=other.columns),
              lsuffix=lsuffix, rsuffix=rsuffix).columns

    new_partition_num = max(len(self._block_partitions.T),
                            len(other._block_partitions.T))

    # Join is a concat once the data has been shuffled into the order of
    # the joined index. Note that the index used for this frame is the
    # `index` variable, which may be 'on'.
    new_self = np.array([
        submit_reindex(index, new_index, new_partition_num, block)
        for block in self._block_partitions.T])
    new_other = np.array([
        submit_reindex(other.index, new_index, new_partition_num, block)
        for block in other._block_partitions.T])

    # Append the blocks together (i.e. concat)
    new_block_parts = np.concatenate((new_self, new_other)).T

    # Default index in the case that on is set.
    if on is not None:
        new_index = None

    # TODO join the two metadata tables for performance.
    return DataFrame(block_partitions=new_block_parts,
                     index=new_index,
                     columns=new_column_labels)
|
||
|
||
def kurt(self, axis=None, skipna=None, level=None, numeric_only=None,
         **kwargs):
    # Kurtosis is not available yet; every call fails and points the
    # caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None,
             **kwargs):
    # Kurtosis is not available yet; every call fails and points the
    # caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def last(self, offset):
    # Selecting a final period by offset is not available yet; every call
    # fails and points the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def last_valid_index(self):
    """Look up the label of the last non-NA/null value.

    Returns:
        scalar: the last valid index as reported by the row metadata
    """
    row_metadata = self._row_metadata
    return row_metadata.last_valid_index()
|
||
|
||
def le(self, other, axis='columns', level=None):
    """Compare this DataFrame to other element-wise using <=.

    Args:
        other: A DataFrame or Series or scalar to compare to.
        axis: The axis to perform the le over.
        level: The Multilevel index level to apply le over.

    Returns:
        A new DataFrame filled with Booleans.
    """
    comparison = pd.DataFrame.le
    return self._operator_helper(comparison, other, axis, level)
|
||
|
||
def lookup(self, row_labels, col_labels):
    # Label-based lookup is not available yet; every call fails and points
    # the caller at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def lt(self, other, axis='columns', level=None):
    """Compare this DataFrame to other element-wise using <.

    Args:
        other: A DataFrame or Series or scalar to compare to.
        axis: The axis to perform the lt over.
        level: The Multilevel index level to apply lt over.

    Returns:
        A new DataFrame filled with Booleans.
    """
    comparison = pd.DataFrame.lt
    return self._operator_helper(comparison, other, axis, level)
|
||
|
||
def mad(self, axis=None, skipna=None, level=None):
    # Not available yet; every call fails and points the caller at the
    # project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
         errors='raise', try_cast=False, raise_on_error=None):
    # Masking is not available yet; every call fails and points the caller
    # at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def max(self, axis=None, skipna=None, level=None, numeric_only=None,
        **kwargs):
    """Compute the maximum across the DataFrame.

    Args:
        axis (int): The axis to take the max on.
        skipna (bool): True to skip NA values, false otherwise.
        level: Forwarded to the pandas max call and to the helper.
        numeric_only: Forwarded to the pandas max call.

    Returns:
        The max of the DataFrame.
    """
    # All arguments are handed to pandas' max unchanged.
    reduction_args = dict(kwargs, axis=axis, skipna=skipna, level=level,
                          numeric_only=numeric_only)
    return self._arithmetic_helper(lambda df: df.max(**reduction_args),
                                   axis, level)
|
||
|
||
def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
         **kwargs):
    """Compute the mean across the DataFrame.

    Args:
        axis (int): The axis to take the mean on.
        skipna (bool): True to skip NA values, false otherwise.
        level: Forwarded to the pandas mean call and to the helper.
        numeric_only: Forwarded to the pandas mean call.

    Returns:
        The mean of the DataFrame. (Pandas series)
    """
    # All arguments are handed to pandas' mean unchanged.
    reduction_args = dict(kwargs, axis=axis, skipna=skipna, level=level,
                          numeric_only=numeric_only)
    return self._arithmetic_helper(lambda df: df.mean(**reduction_args),
                                   axis, level)
|
||
|
||
def median(self, axis=None, skipna=None, level=None, numeric_only=None,
           **kwargs):
    """Compute the median across the DataFrame.

    Args:
        axis (int): The axis to take the median on.
        skipna (bool): True to skip NA values, false otherwise.
        level: Forwarded to the pandas median call and to the helper.
        numeric_only: Forwarded to the pandas median call.

    Returns:
        The median of the DataFrame. (Pandas series)
    """
    # All arguments are handed to pandas' median unchanged.
    reduction_args = dict(kwargs, axis=axis, skipna=skipna, level=level,
                          numeric_only=numeric_only)
    return self._arithmetic_helper(lambda df: df.median(**reduction_args),
                                   axis, level)
|
||
|
||
def melt(self, id_vars=None, value_vars=None, var_name=None,
         value_name='value', col_level=None):
    # Melting is not available yet; every call fails and points the caller
    # at the project.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def memory_usage(self, index=True, deep=False):
    """Report the memory usage of every column.

    Args:
        index (bool): Whether to put the memory usage of the index, under
            the label 'Index', in front of the per-column values.
        deep (bool): Forwarded to the pandas memory_usage calls.

    Returns:
        A Series labelled by this frame's columns, with the 'Index' entry
        first when index is truthy.
    """
    # The index is left out per partition; it is accounted for once below.
    per_column = self._arithmetic_helper(
        lambda df: df.memory_usage(index=False, deep=deep), axis=0)
    per_column.index = self.columns

    if not index:
        return per_column

    index_bytes = self._row_metadata.index.memory_usage(deep=deep)
    return pd.Series(index_bytes, index=['Index']).append(per_column)
|
||
|
||
def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
          left_index=False, right_index=False, sort=False,
          suffixes=('_x', '_y'), copy=True, indicator=False,
          validate=None):
    """Database style join, where common columns in "on" are merged.

    Args:
        right: The DataFrame to merge against.
        how: What type of join to use.
        on: The common column name(s) to join on. If None, and left_on and
            right_on are also None, will default to all commonly named
            columns.
        left_on: The column(s) on the left to use for the join.
        right_on: The column(s) on the right to use for the join.
        left_index: Use the index from the left as the join keys.
        right_index: Use the index from the right as the join keys.
        sort: Sort the join keys lexicographically in the result.
        suffixes: Add this suffix to the common names not in the "on".
        copy: Does nothing in our implementation
        indicator: Adds a column named _merge to the DataFrame with
            metadata from the merge about each row.
        validate: Checks if merge is a specific type.

    Returns:
        A merged Dataframe

    Raises:
        ValueError: If right is not a DataFrame.
        MergeError: If "on" is combined with "left_on" or "right_on".
        KeyError: If a requested join key is missing from the frame it
            should come from.
        TypeError: If join keys are given for one side only.
    """

    if not isinstance(right, DataFrame):
        raise ValueError("can not merge DataFrame with instance of type "
                         "{}".format(type(right)))

    # The positional arguments forwarded to the pandas merge calls. They
    # are captured before the key arguments are normalised into lists
    # below, and `copy` is always forwarded as False.
    args = (how, on, left_on, right_on, left_index, right_index, sort,
            suffixes, False, indicator, validate)

    left_cols = ray.put(self.columns)
    right_cols = ray.put(right.columns)

    # This can be put in a remote function because we don't need it until
    # the end, and the columns can be built asynchronously. This takes the
    # columns defining off the critical path and speeds up the overall
    # merge.
    new_columns = _merge_columns.remote(left_cols, right_cols, *args)

    if on is not None:
        if left_on is not None or right_on is not None:
            raise MergeError("Can only pass argument \"on\" OR \"left_on\""
                             " and \"right_on\", not a combination of "
                             "both.")
        if not is_list_like(on):
            on = [on]

        if any(key not in self for key in on) or \
                any(key not in right for key in on):

            # The first key missing on the left is concatenated with the
            # first key missing on the right.
            missing_key = \
                next((str(key) for key in on if key not in self), "") + \
                next((str(key) for key in on if key not in right), "")
            raise KeyError(missing_key)

    elif right_on is not None or right_index is True:
        if left_on is None and left_index is False:
            # Note: This is not the same error as pandas, but pandas throws
            # a ValueError NoneType has no len(), and I don't think that
            # helps enough.
            raise TypeError("left_on must be specified or left_index must "
                            "be true if right_on is specified.")

    elif left_on is not None or left_index is True:
        if right_on is None and right_index is False:
            # Note: See note above about TypeError.
            raise TypeError("right_on must be specified or right_index "
                            "must be true if left_on is specified.")

    if left_on is not None:
        if not is_list_like(left_on):
            left_on = [left_on]

        if any(key not in self for key in left_on):
            raise KeyError(next(key for key in left_on
                                if key not in self))

    if right_on is not None:
        if not is_list_like(right_on):
            right_on = [right_on]

        if any(key not in right for key in right_on):
            raise KeyError(next(key for key in right_on
                                if key not in right))

    # There's a small chance that our partitions are already perfect, but
    # if it's not, we need to adjust them. We adjust the right against the
    # left because the defaults of merge rely on the order of the left. We
    # have to push the index down here, so if we're joining on the right's
    # index we go ahead and push it down here too.
    if not np.array_equal(self._row_metadata._lengths,
                          right._row_metadata._lengths) or right_index:

        repartitioned_right = np.array([_match_partitioning._submit(
            args=(df, self._row_metadata._lengths, right.index),
            num_return_vals=len(self._row_metadata._lengths))
            for df in right._col_partitions]).T
    else:
        repartitioned_right = right._block_partitions

    if not left_index and not right_index:
        # Passing None to each call specifies that we don't care about the
        # left's index for the join.
        left_idx = itertools.repeat(None)

        # We only return the index if we need to update it, and that only
        # happens when either left_index or right_index is True. We will
        # use this value to add the return vals if we are getting an index
        # back.
        return_index = False
    else:
        # We build this to push the index down so that we can use it for
        # the join.
        left_idx = \
            (v.index for k, v in
             self._row_metadata._coord_df.copy().groupby('partition'))
        return_index = True

    # One remote merge per pair of (left, right) rows of blocks. `left_idx`
    # is advanced once per pair, and one extra return value is requested
    # when the index has to come back as well.
    new_blocks = \
        np.array([_co_op_helper._submit(
            args=tuple([lambda x, y: x.merge(y, *args),
                        left_cols, right_cols,
                        len(self._block_partitions.T), next(left_idx)] +
                       np.concatenate(obj).tolist()),
            num_return_vals=len(self._block_partitions.T) + return_index)
            for obj in zip(self._block_partitions,
                           repartitioned_right)])

    if not return_index:
        # Default to RangeIndex if left_index and right_index both false.
        new_index = None
    else:
        # The last return value of every call is taken as its piece of the
        # new index; the remaining ones are the data blocks.
        new_index_parts = new_blocks[:, -1]
        new_index = _concat_index.remote(*new_index_parts)
        new_blocks = new_blocks[:, :-1]

    return DataFrame(block_partitions=new_blocks,
                     columns=new_columns,
                     index=new_index)
|
||
|
||
def min(self, axis=None, skipna=None, level=None, numeric_only=None,
        **kwargs):
    """Perform min across the DataFrame.

    Args:
        axis (int): The axis to take the min on.
        skipna (bool): True to skip NA values, false otherwise.
        level: Forwarded to each partition's ``min`` and to
            ``_arithmetic_helper``.
        numeric_only: Forwarded unchanged to each partition's ``min``.
        **kwargs: Extra keyword arguments forwarded to each partition.

    Returns:
        The min of the DataFrame, as assembled by ``_arithmetic_helper``.
    """
    # Every partition receives exactly the same set of options.
    min_options = dict(kwargs, axis=axis, skipna=skipna, level=level,
                       numeric_only=numeric_only)

    def partition_min(part):
        return part.min(**min_options)

    return self._arithmetic_helper(partition_min, axis, level)
|
||
|
||
def mod(self, other, axis='columns', level=None, fill_value=None):
    """Mods this DataFrame against another DataFrame/Series/scalar.

    Args:
        other: The object to use to apply the mod against this.
        axis: The axis to mod over.
        level: The Multilevel index level to apply mod over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the Mod applied.
    """
    # The pandas operator is handed to the shared helper, which applies it.
    modulo = pd.DataFrame.mod
    return self._operator_helper(modulo, other, axis, level, fill_value)
|
||
|
||
def mode(self, axis=0, numeric_only=False):
    """Perform mode across the DataFrame.

    Args:
        axis (int): The axis to take the mode on.
        numeric_only (bool): if True, only apply to numeric columns.

    Returns:
        DataFrame: The mode of the DataFrame.
    """
    axis = pd.DataFrame()._get_axis_number(axis)

    def partition_mode(part_df):
        # Return the partial result plus its length along ``axis`` so the
        # partitions can later be padded to a common length.
        partial_mode = part_df.mode(axis=axis, numeric_only=numeric_only)
        return partial_mode, partial_mode.shape[axis]

    def pad_to_longest(part_df, all_lengths):
        # Reindex against a RangeIndex as long as the longest partial result.
        return part_df.reindex(pd.RangeIndex(max(all_lengths)), axis=axis)

    source_parts = (self._col_partitions if axis == 0
                    else self._row_partitions)

    submitted = [
        _deploy_func._submit(args=(lambda df: partition_mode(df), part),
                             num_return_vals=2)
        for part in source_parts]

    # Split the (result, length) pairs into two parallel lists.
    mode_parts, part_lengths = [list(group) for group in zip(*submitted)]

    padded_parts = [
        _deploy_func.remote(
            lambda df, *lens: pad_to_longest(df, lens), part, *part_lengths)
        for part in mode_parts]

    if axis == 0:
        return DataFrame(col_partitions=padded_parts,
                         columns=self.columns)
    return DataFrame(row_partitions=padded_parts,
                     index=self.index)
|
||
|
||
def mul(self, other, axis='columns', level=None, fill_value=None):
    """Multiplies this DataFrame against another DataFrame/Series/scalar.

    Args:
        other: The object to use to apply the multiply against this.
        axis: The axis to multiply over.
        level: The Multilevel index level to apply multiply over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the Multiply applied.
    """
    # The pandas operator is handed to the shared helper, which applies it.
    multiply_op = pd.DataFrame.mul
    return self._operator_helper(multiply_op, other, axis, level,
                                 fill_value)
|
||
|
||
def multiply(self, other, axis='columns', level=None, fill_value=None):
    """Synonym for mul.

    Args:
        other: The object to use to apply the multiply against this.
        axis: The axis to multiply over.
        level: The Multilevel index level to apply multiply over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the Multiply applied.
    """
    # Forward the arguments positionally, in ``mul``'s parameter order.
    forwarded = (other, axis, level, fill_value)
    return self.mul(*forwarded)
|
||
|
||
def ne(self, other, axis='columns', level=None):
    """Checks element-wise that this is not equal to other.

    Args:
        other: A DataFrame or Series or scalar to compare to.
        axis: The axis to perform the ne over.
        level: The Multilevel index level to apply ne over.

    Returns:
        A new DataFrame filled with Booleans.
    """
    # The pandas comparison is handed to the shared helper, which applies it.
    not_equal = pd.DataFrame.ne
    return self._operator_helper(not_equal, other, axis, level)
|
||
|
||
def nlargest(self, n, columns, keep='first'):
    """Placeholder: ``nlargest`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def notna(self):
    """Perform notna across the DataFrame.

    Returns:
        Boolean DataFrame where value is False if corresponding
        value is NaN, True otherwise
    """
    def mark_present(part):
        return part.notna()

    # Map each row of blocks separately, then rebuild the block grid.
    mapped_rows = [_map_partitions(mark_present, block_row)
                   for block_row in self._block_partitions]

    return DataFrame(block_partitions=np.array(mapped_rows),
                     row_metadata=self._row_metadata,
                     col_metadata=self._col_metadata)
|
||
|
||
def notnull(self):
    """Perform notnull across the DataFrame.

    Returns:
        Boolean DataFrame where value is False if corresponding
        value is NaN, True otherwise
    """
    def mark_present(part):
        return part.notnull()

    # Map each row of blocks separately, then rebuild the block grid.
    mapped_rows = [_map_partitions(mark_present, block_row)
                   for block_row in self._block_partitions]

    return DataFrame(block_partitions=np.array(mapped_rows),
                     row_metadata=self._row_metadata,
                     col_metadata=self._col_metadata)
|
||
|
||
def nsmallest(self, n, columns, keep='first'):
    """Placeholder: ``nsmallest`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def nunique(self, axis=0, dropna=True):
    """Return Series with number of distinct
       observations over requested axis.

    Args:
        axis : {0 or 'index', 1 or 'columns'}, default 0
        dropna : boolean, default True

    Returns:
        nunique : Series
    """
    options = {'axis': axis, 'dropna': dropna}

    def partition_nunique(part):
        return part.nunique(**options)

    return self._arithmetic_helper(partition_nunique, axis)
|
||
|
||
def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
               **kwargs):
    """Placeholder: ``pct_change`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def pipe(self, func, *args, **kwargs):
    """Apply func(self, *args, **kwargs)

    The dispatch is done here instead of through the private pandas
    helper ``com._pipe``, so this method does not break when that
    private name changes between pandas releases.

    Args:
        func: function to apply to the df. May also be a
            ``(callable, data_keyword)`` tuple, in which case this
            DataFrame is passed to ``callable`` as the keyword argument
            named ``data_keyword`` instead of as the first positional
            argument.
        args: positional arguments passed into ``func``.
        kwargs: a dictionary of keyword arguments passed into ``func``.

    Returns:
        object: the return type of ``func``.

    Raises:
        ValueError: If ``func`` is a tuple and its keyword name is also
            present in ``kwargs``.
    """
    if isinstance(func, tuple):
        func, target = func
        if target in kwargs:
            msg = '%s is both the pipe target and a keyword argument' % target
            raise ValueError(msg)
        # The frame travels as a keyword argument in the tuple form.
        kwargs[target] = self
        return func(*args, **kwargs)
    return func(self, *args, **kwargs)
|
||
|
||
def pivot(self, index=None, columns=None, values=None):
    """Placeholder: ``pivot`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def pivot_table(self, values=None, index=None, columns=None,
                aggfunc='mean', fill_value=None, margins=False,
                dropna=True, margins_name='All'):
    """Placeholder: ``pivot_table`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def plot(self, x=None, y=None, kind='line', ax=None, subplots=False,
         sharex=None, sharey=False, layout=None, figsize=None,
         use_index=True, title=None, grid=None, legend=True, style=None,
         logx=False, logy=False, loglog=False, xticks=None, yticks=None,
         xlim=None, ylim=None, rot=None, fontsize=None, colormap=None,
         table=False, yerr=None, xerr=None, secondary_y=False,
         sort_columns=False, **kwds):
    """Placeholder: ``plot`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def pop(self, item):
    """Pops an item from this DataFrame and returns it.

    Args:
        item (str): Column label to be popped

    Returns:
        A Series containing the popped values. Also modifies this
        DataFrame.
    """
    # Read the column first; it is only deleted once the lookup succeeded.
    popped = self[item]
    del self[item]
    return popped
|
||
|
||
def pow(self, other, axis='columns', level=None, fill_value=None):
    """Pow this DataFrame against another DataFrame/Series/scalar.

    Args:
        other: The object to use to apply the pow against this.
        axis: The axis to pow over.
        level: The Multilevel index level to apply pow over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the Pow applied.
    """
    # The pandas operator is handed to the shared helper, which applies it.
    power = pd.DataFrame.pow
    return self._operator_helper(power, other, axis, level, fill_value)
|
||
|
||
def prod(self, axis=None, skipna=None, level=None, numeric_only=None,
         min_count=1, **kwargs):
    """Return the product of the values for the requested axis

    Args:
        axis : {index (0), columns (1)}
        skipna : boolean, default True
        level : int or level name, default None
        numeric_only : boolean, default None
        min_count : int, default 1

    Returns:
        prod : Series or DataFrame (if level specified)
    """
    # Every partition receives exactly the same set of options.
    prod_options = dict(kwargs, axis=axis, skipna=skipna, level=level,
                        numeric_only=numeric_only, min_count=min_count)

    def partition_prod(part):
        return part.prod(**prod_options)

    return self._arithmetic_helper(partition_prod, axis, level)
|
||
|
||
def product(self, axis=None, skipna=None, level=None, numeric_only=None,
            min_count=1, **kwargs):
    """Return the product of the values for the requested axis

    This simply forwards every argument to ``prod``.

    Args:
        axis : {index (0), columns (1)}
        skipna : boolean, default True
        level : int or level name, default None
        numeric_only : boolean, default None
        min_count : int, default 1

    Returns:
        product : Series or DataFrame (if level specified)
    """
    forwarded = dict(kwargs, axis=axis, skipna=skipna, level=level,
                     numeric_only=numeric_only, min_count=min_count)
    return self.prod(**forwarded)
|
||
|
||
def quantile(self, q=0.5, axis=0, numeric_only=True,
             interpolation='linear'):
    """Return values at the given quantile over requested axis,
        a la numpy.percentile.

    Args:
        q (float): 0 <= q <= 1, the quantile(s) to compute
        axis (int): 0 or 'index' for row-wise,
                    1 or 'columns' for column-wise
        numeric_only (bool): Forwarded to each partition's ``quantile``;
            also selects which up-front dtype validation is performed.
        interpolation: {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            Specifies which interpolation method to use

    Returns:
        quantiles : Series or DataFrame
                If q is an array, a DataFrame will be returned where the
                index is q, the columns are the columns of self, and the
                values are the quantiles.

                If q is a float, a Series will be returned where the
                index is the columns of self and the values
                are the quantiles.

    Raises:
        TypeError: If ``numeric_only`` is falsy and every column is of
            object or timedelta dtype.
        ValueError: If ``numeric_only`` is falsy and at least one column
            is of object or timedelta dtype, or if ``numeric_only`` is
            truthy and every column is.
    """

    def check_bad_dtype(t):
        # Object and timedelta dtypes are the ones rejected up front.
        return t == np.dtype('O') or is_timedelta64_dtype(t)

    # The errors below are raised before any remote work is launched.
    # NOTE(review): the messages look like they are meant to mirror the
    # ones pandas raises in the same situations -- confirm.
    if not numeric_only:
        # check if there are any object columns
        if all(check_bad_dtype(t) for t in self.dtypes):
            raise TypeError("can't multiply sequence by non-int of type "
                            "'float'")
        else:
            if next((True for t in self.dtypes if check_bad_dtype(t)),
                    False):
                # Report the first offending dtype.
                dtype = next(t for t in self.dtypes if check_bad_dtype(t))
                raise ValueError("Cannot compare type '{}' with type '{}'"
                                 .format(type(dtype), float))
    else:
        # Normally pandas returns this near the end of the quantile, but we
        # can't afford the overhead of running the entire operation before
        # we error.
        if all(check_bad_dtype(t) for t in self.dtypes):
            raise ValueError("need at least one array to concatenate")

    # check that all qs are between 0 and 1
    pd.DataFrame()._check_percentile(q)

    def quantile_helper(df, base_object):
        """Quantile to be run inside each partition.

        Args:
            df: The DataFrame composing the partition.
            base_object: An empty pd.Series or pd.DataFrame depending on q.

        Returns:
            A new Series or DataFrame depending on q.
        """
        # This if call prevents ValueErrors with object only partitions
        if (numeric_only and
                all([dtype == np.dtype('O') or
                     is_timedelta64_dtype(dtype)
                     for dtype in df.dtypes])):
            return base_object
        else:
            return df.quantile(q=q, axis=axis, numeric_only=numeric_only,
                               interpolation=interpolation)

    # Resolved here, after the helper is defined; the helper reads ``axis``
    # only when it is called, so it sees the normalized value.
    axis = pd.DataFrame()._get_axis_number(axis)

    if isinstance(q, (pd.Series, np.ndarray, pd.Index, list)):

        # The requested quantiles become the index of the result.
        q_index = pd.Float64Index(q)

        if axis == 0:
            new_partitions = _map_partitions(
                lambda df: quantile_helper(df, pd.DataFrame()),
                self._col_partitions)

            # select only correct dtype columns
            new_columns = self.dtypes[self.dtypes.apply(
                lambda x: is_numeric_dtype(x))].index

        else:
            new_partitions = _map_partitions(
                lambda df: quantile_helper(df, pd.DataFrame()),
                self._row_partitions)
            # The original index labels become the result's columns.
            new_columns = self.index

        return DataFrame(col_partitions=new_partitions,
                         index=q_index,
                         columns=new_columns)

    else:
        # When q is a single float, we return a Series, so using
        # arithmetic_helper works well here.
        result = self._arithmetic_helper(
            lambda df: quantile_helper(df, pd.Series()), axis)
        result.name = q
        return result
|
||
|
||
def query(self, expr, inplace=False, **kwargs):
    """Queries the Dataframe with a boolean expression

    Args:
        expr: The query expression, evaluated on every row partition.
        inplace: Whether to update this DataFrame instead of returning a
            new one.
        **kwargs: Forwarded to the validation step and to each
            partition's ``query``.

    Returns:
        A new DataFrame if inplace=False
    """
    self._validate_eval_query(expr, **kwargs)

    real_columns = self.columns

    def filter_partition(part):
        # Work on a copy carrying the real column labels so ``expr`` can
        # refer to them, then restore positional labels afterwards.
        labelled = part.copy()
        labelled.columns = real_columns
        labelled.query(expr, inplace=True, **kwargs)
        labelled.columns = pd.RangeIndex(0, len(labelled.columns))
        return labelled

    filtered_rows = _map_partitions(filter_partition, self._row_partitions)

    if not inplace:
        return DataFrame(row_partitions=filtered_rows,
                         col_metadata=self._col_metadata)
    self._update_inplace(row_partitions=filtered_rows)
|
||
|
||
def radd(self, other, axis='columns', level=None, fill_value=None):
    """Reflected add; forwards every argument to ``add``.

    Args:
        other: The object to add to this.
        axis: The axis to add over.
        level: The Multilevel index level to apply add over.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``add`` returns for the same arguments.
    """
    forwarded = (other, axis, level, fill_value)
    return self.add(*forwarded)
|
||
|
||
def rank(self, axis=0, method='average', numeric_only=None,
         na_option='keep', ascending=True, pct=False):
    """
    Compute numerical data ranks (1 through n) along axis.
    Equal values are assigned a rank that is the [method] of
    the ranks of those values.

    Args:
        axis (int): 0 or 'index' for row-wise,
                    1 or 'columns' for column-wise
        method: {'average', 'min', 'max', 'first', 'dense'}
            Specifies which method to use for equal vals
        numeric_only (boolean)
            Include only float, int, boolean data.
        na_option: {'keep', 'top', 'bottom'}
            Specifies how to handle NA options
        ascending (boolean):
            Decides ranking order
        pct (boolean):
            Computes percentage ranking of data
    Returns:
        A new DataFrame
    """
    axis = pd.DataFrame()._get_axis_number(axis)

    def rank_partition(part):
        return part.rank(axis=axis, method=method,
                         numeric_only=numeric_only,
                         na_option=na_option,
                         ascending=ascending, pct=pct)

    if axis == 1:
        # Only the numeric columns label the result when ranking along
        # the columns.
        numeric_mask = self.dtypes.apply(lambda dt: is_numeric_dtype(dt))
        numeric_columns = self.dtypes[numeric_mask].index
        ranked_rows = _map_partitions(rank_partition,
                                      self._row_partitions)
        return DataFrame(row_partitions=ranked_rows,
                         columns=numeric_columns,
                         index=self.index)

    if axis == 0:
        ranked_cols = _map_partitions(rank_partition,
                                      self._col_partitions)
        return DataFrame(col_partitions=ranked_cols,
                         columns=self.columns,
                         index=self.index)
|
||
|
||
def rdiv(self, other, axis='columns', level=None, fill_value=None):
    """Apply ``rdiv`` to the underlying frame(s) via the single-df helper.

    Args:
        other: The other operand, forwarded to ``rdiv``.
        axis: The axis forwarded to ``rdiv``.
        level: The Multilevel index level forwarded to ``rdiv``.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``_single_df_op_helper`` returns.
    """
    def reflected_div(part):
        return part.rdiv(other, axis, level, fill_value)

    return self._single_df_op_helper(reflected_div, other, axis, level)
|
||
|
||
def reindex(self, labels=None, index=None, columns=None, axis=None,
            method=None, copy=True, level=None, fill_value=np.nan,
            limit=None, tolerance=None):
    """Placeholder: ``reindex`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
                 limit=None, fill_value=np.nan):
    """Placeholder: ``reindex_axis`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def reindex_like(self, other, method=None, copy=True, limit=None,
                 tolerance=None):
    """Placeholder: ``reindex_like`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def rename(self, mapper=None, index=None, columns=None, axis=None,
           copy=True, inplace=False, level=None):
    """Alters axes labels.

    Args:
        mapper, index, columns: Transformations to apply to the axis's
            values.
        axis: Axis to target with mapper.
        copy: Also copy underlying data.
        inplace: Whether to return a new DataFrame.
        level: Only rename a specific level of a MultiIndex.

    Returns:
        If inplace is False, a new DataFrame with the updated axes.
    """
    inplace = validate_bool_kwarg(inplace, 'inplace')

    # pandas' rename does not ignore explicit None values, so only the
    # arguments that were actually provided are forwarded.
    provided = dict(mapper=mapper, index=index, columns=columns, axis=axis,
                    copy=copy, inplace=inplace, level=level)
    rename_kwargs = {key: value for key, value in provided.items()
                     if value is not None}
    # The rename runs on a throwaway, label-only frame, so it is always
    # done in place; only the resulting labels are used afterwards.
    rename_kwargs['inplace'] = True

    label_frame = pd.DataFrame(index=self.index, columns=self.columns)
    label_frame.rename(**rename_kwargs)

    target = self if inplace else self.copy()
    target.index = label_frame.index
    target.columns = label_frame.columns

    if not inplace:
        return target
|
||
|
||
def rename_axis(self, mapper, axis=0, copy=True, inplace=False):
    """Set the name of the index or of the columns to ``mapper``.

    Args:
        mapper: The value assigned to the ``name`` of the chosen axis.
        axis: 1 or "columns" targets the columns; anything else targets
            the index.
        copy: Unused by this implementation.
        inplace: Whether to modify ``self`` instead of a copy.

    Returns:
        The renamed copy if inplace is False, None otherwise.
    """
    target = self if inplace else self.copy()
    if axis == 1 or axis == "columns":
        axis_labels = target.columns
    else:
        axis_labels = target.index
    axis_labels.name = mapper
    return None if inplace else target
|
||
|
||
def _set_axis_name(self, name, axis=0, inplace=False):
|
||
"""Alter the name or names of the axis.
|
||
|
||
Args:
|
||
name: Name for the Index, or list of names for the MultiIndex
|
||
axis: 0 or 'index' for the index; 1 or 'columns' for the columns
|
||
inplace: Whether to modify `self` directly or return a copy
|
||
|
||
Returns:
|
||
Type of caller or None if inplace=True.
|
||
"""
|
||
axes_is_columns = axis == 1 or axis == "columns"
|
||
renamed = self if inplace else self.copy()
|
||
if axes_is_columns:
|
||
renamed.columns.set_names(name)
|
||
else:
|
||
renamed.index.set_names(name)
|
||
|
||
if not inplace:
|
||
return renamed
|
||
|
||
def reorder_levels(self, order, axis=0):
    """Placeholder: ``reorder_levels`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def replace(self, to_replace=None, value=None, inplace=False, limit=None,
            regex=False, method='pad', axis=None):
    """Placeholder: ``replace`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
             label=None, convention='start', kind=None, loffset=None,
             limit=None, base=0, on=None, level=None):
    """Placeholder: ``resample`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
                col_fill=''):
    """Reset this index to default and create column from current index.

    Args:
        level: Only remove the given levels from the index. Removes all
            levels by default
        drop: Do not try to insert index into dataframe columns. This
            resets the index to the default integer index.
        inplace: Modify the DataFrame in place (do not create a new object)
        col_level : If the columns have multiple levels, determines which
            level the labels are inserted into. By default it is inserted
            into the first level.
        col_fill: If the columns have multiple levels, determines how the
            other levels are named. If None then the index name is
            repeated.

    Returns:
        A new DataFrame if inplace is False, None otherwise.

    Raises:
        ValueError: If the columns are a MultiIndex, ``col_fill`` is None
            and an index name is a tuple whose length is neither 1 nor the
            number of column levels.
    """
    inplace = validate_bool_kwarg(inplace, 'inplace')
    if inplace:
        new_obj = self
    else:
        new_obj = self.copy()

    def _maybe_casted_values(index, labels=None):
        # Turn an index (or one MultiIndex level plus its integer labels)
        # into the values of the column to be inserted.
        if isinstance(index, pd.PeriodIndex):
            values = index.asobject.values
        elif isinstance(index, pd.DatetimeIndex) and index.tz is not None:
            # tz-aware datetimes are kept as an index object.
            values = index
        else:
            values = index.values
            if values.dtype == np.object_:
                values = lib.maybe_convert_objects(values)

        # if we have the labels, extract the values with a mask
        if labels is not None:
            # -1 marks positions with no matching level value.
            mask = labels == -1

            # we can have situations where the whole mask is -1,
            # meaning there is nothing found in labels, so make all nan's
            if mask.all():
                values = np.empty(len(mask))
                values.fill(np.nan)
            else:
                values = values.take(labels)
                if mask.any():
                    # Put NaN at the masked positions, upcasting if needed.
                    values, changed = maybe_upcast_putmask(
                        values, mask, np.nan)
        return values

    # We're building a new default index dataframe for use later.
    new_index = pd.RangeIndex(len(self))
    if level is not None:
        if not isinstance(level, (tuple, list)):
            level = [level]
        # Normalize level names/positions to level numbers.
        level = [self.index._get_level_number(lev) for lev in level]
        if isinstance(self.index, pd.MultiIndex):
            # Only some levels are removed: keep the rest as the new index.
            if len(level) < self.index.nlevels:
                new_index = self.index.droplevel(level)

    if not drop:
        if isinstance(self.index, pd.MultiIndex):
            # Unnamed levels get positional 'level_<i>' names.
            names = [n if n is not None else ('level_%d' % i)
                     for (i, n) in enumerate(self.index.names)]
            to_insert = lzip(self.index.levels, self.index.labels)
        else:
            # Pick the first of 'index', 'level_0', 'level_1', ... that
            # ``in self`` does not report as already present.
            default = 'index'
            i = 0
            while default in self:
                default = 'level_{}'.format(i)
                i += 1

            names = ([default] if self.index.name is None
                     else [self.index.name])
            to_insert = ((self.index, None),)

        multi_col = isinstance(self.columns, pd.MultiIndex)
        # Each level is inserted at position 0, so walking the levels in
        # reverse leaves them in their original order.
        for i, (lev, lab) in reversed(list(enumerate(to_insert))):
            if not (level is None or i in level):
                continue
            name = names[i]
            if multi_col:
                col_name = (list(name) if isinstance(name, tuple)
                            else [name])
                if col_fill is None:
                    if len(col_name) not in (1, self.columns.nlevels):
                        raise ValueError("col_fill=None is incompatible "
                                         "with incomplete column name "
                                         "{}".format(name))
                    col_fill = col_name[0]

                # Pad the name with ``col_fill`` before and after so it
                # spans every column level.
                lev_num = self.columns._get_level_number(col_level)
                name_lst = [col_fill] * lev_num + col_name
                missing = self.columns.nlevels - len(name_lst)
                name_lst += [col_fill] * missing
                name = tuple(name_lst)
            # to ndarray and maybe infer different dtype
            level_values = _maybe_casted_values(lev, lab)
            new_obj.insert(0, name, level_values)

    new_obj.index = new_index

    if not inplace:
        return new_obj
|
||
|
||
def rfloordiv(self, other, axis='columns', level=None, fill_value=None):
    """Apply ``rfloordiv`` to the underlying frame(s) via the single-df
    helper.

    Args:
        other: The other operand, forwarded to ``rfloordiv``.
        axis: The axis forwarded to ``rfloordiv``.
        level: The Multilevel index level forwarded to ``rfloordiv``.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``_single_df_op_helper`` returns.
    """
    def reflected_floordiv(part):
        return part.rfloordiv(other, axis, level, fill_value)

    return self._single_df_op_helper(reflected_floordiv, other, axis, level)
|
||
|
||
def rmod(self, other, axis='columns', level=None, fill_value=None):
    """Apply ``rmod`` to the underlying frame(s) via the single-df helper.

    Args:
        other: The other operand, forwarded to ``rmod``.
        axis: The axis forwarded to ``rmod``.
        level: The Multilevel index level forwarded to ``rmod``.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``_single_df_op_helper`` returns.
    """
    def reflected_mod(part):
        return part.rmod(other, axis, level, fill_value)

    return self._single_df_op_helper(reflected_mod, other, axis, level)
|
||
|
||
def rmul(self, other, axis='columns', level=None, fill_value=None):
    """Reflected multiply; forwards every argument to ``mul``.

    Args:
        other: The object to multiply with this.
        axis: The axis to multiply over.
        level: The Multilevel index level to apply multiply over.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``mul`` returns for the same arguments.
    """
    forwarded = (other, axis, level, fill_value)
    return self.mul(*forwarded)
|
||
|
||
def rolling(self, window, min_periods=None, freq=None, center=False,
            win_type=None, on=None, axis=0, closed=None):
    """Placeholder: ``rolling`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def round(self, decimals=0, *args, **kwargs):
    """Round every block of the DataFrame.

    Args:
        decimals: Forwarded to each block's ``round`` as ``decimals``.
        *args: Extra positional arguments forwarded to each block.
        **kwargs: Extra keyword arguments forwarded to each block.

    Returns:
        A new DataFrame built from the rounded blocks, reusing this
        DataFrame's row and column metadata.
    """
    def round_partition(part):
        return part.round(decimals=decimals, *args, **kwargs)

    # Map each row of blocks separately, then rebuild the block grid.
    rounded_rows = [_map_partitions(round_partition, block_row)
                    for block_row in self._block_partitions]

    return DataFrame(block_partitions=np.array(rounded_rows),
                     row_metadata=self._row_metadata,
                     col_metadata=self._col_metadata)
|
||
|
||
def rpow(self, other, axis='columns', level=None, fill_value=None):
    """Apply ``rpow`` to the underlying frame(s) via the single-df helper.

    Args:
        other: The other operand, forwarded to ``rpow``.
        axis: The axis forwarded to ``rpow``.
        level: The Multilevel index level forwarded to ``rpow``.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``_single_df_op_helper`` returns.
    """
    def reflected_pow(part):
        return part.rpow(other, axis, level, fill_value)

    return self._single_df_op_helper(reflected_pow, other, axis, level)
|
||
|
||
def rsub(self, other, axis='columns', level=None, fill_value=None):
    """Apply ``rsub`` to the underlying frame(s) via the single-df helper.

    Args:
        other: The other operand, forwarded to ``rsub``.
        axis: The axis forwarded to ``rsub``.
        level: The Multilevel index level forwarded to ``rsub``.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``_single_df_op_helper`` returns.
    """
    def reflected_sub(part):
        return part.rsub(other, axis, level, fill_value)

    return self._single_df_op_helper(reflected_sub, other, axis, level)
|
||
|
||
def rtruediv(self, other, axis='columns', level=None, fill_value=None):
    """Apply ``rtruediv`` to the underlying frame(s) via the single-df
    helper.

    Args:
        other: The other operand, forwarded to ``rtruediv``.
        axis: The axis forwarded to ``rtruediv``.
        level: The Multilevel index level forwarded to ``rtruediv``.
        fill_value: The value to fill NaNs with.

    Returns:
        Whatever ``_single_df_op_helper`` returns.
    """
    def reflected_truediv(part):
        return part.rtruediv(other, axis, level, fill_value)

    return self._single_df_op_helper(reflected_truediv, other, axis, level)
|
||
|
||
def sample(self, n=None, frac=None, replace=False, weights=None,
           random_state=None, axis=None):
    """Returns a random sample of items from an axis of object.

    Args:
        n: Number of items from axis to return. Cannot be used with frac.
            Default = 1 if frac = None.
        frac: Fraction of axis items to return. Cannot be used with n.
        replace: Sample with or without replacement. Default = False.
        weights: Default 'None' results in equal probability weighting.
            If passed a Series, will align with target object on index.
            Index values in weights not found in sampled object will be
            ignored and index values in sampled object not in weights will
            be assigned weights of zero. If called on a DataFrame, will
            accept the name of a column when axis = 0. Unless weights are
            a Series, weights must be same length as axis being sampled.
            If weights do not sum to 1, they will be normalized to sum
            to 1. Missing values in the weights column will be treated as
            zero. inf and -inf values not allowed.
        random_state: Seed for the random number generator (if int), or
            numpy RandomState object.
        axis: Axis to sample. Accepts axis number or name.

    Returns:
        A new Dataframe

    Raises:
        KeyError: If ``weights`` is a string that is not a valid column.
        ValueError: If the arguments are inconsistent or invalid (both
            ``n`` and ``frac`` given, non-integer or negative ``n``,
            invalid weights, or an unsupported ``random_state`` type).
    """

    axis = pd.DataFrame()._get_axis_number(axis) if axis is not None \
        else 0

    if axis == 0:
        axis_length = len(self._row_metadata)
    else:
        axis_length = len(self._col_metadata)

    if weights is not None:

        # Index of the weights Series should correspond to the index of the
        # Dataframe in order to sample
        if isinstance(weights, pd.Series):
            weights = weights.reindex(self.axes[axis])

        # If weights arg is a string, the weights used for sampling will
        # the be values in the column corresponding to that string
        if isinstance(weights, string_types):
            if axis == 0:
                try:
                    weights = self[weights]
                except KeyError:
                    raise KeyError("String passed to weights not a "
                                   "valid column")
            else:
                raise ValueError("Strings can only be passed to "
                                 "weights when sampling from rows on "
                                 "a DataFrame")

        weights = pd.Series(weights, dtype='float64')

        if len(weights) != axis_length:
            raise ValueError("Weights and axis to be sampled must be of "
                             "same length")

        if (weights == np.inf).any() or (weights == -np.inf).any():
            raise ValueError("weight vector may not include `inf` values")

        if (weights < 0).any():
            raise ValueError("weight vector many not include negative "
                             "values")

        # weights cannot be NaN when sampling, so we must set all nan
        # values to 0
        weights = weights.fillna(0)

        # If passed in weights are not equal to 1, renormalize them
        # otherwise numpy sampling function will error
        weights_sum = weights.sum()
        if weights_sum != 1:
            if weights_sum != 0:
                weights = weights / weights_sum
            else:
                raise ValueError("Invalid weights: weights sum to zero")

        weights = weights.values

    if n is None and frac is None:
        # default to n = 1 if n and frac are both None (in accordance with
        # Pandas specification)
        n = 1
    elif n is not None and frac is None and n % 1 != 0:
        # n must be an integer
        raise ValueError("Only integers accepted as `n` values")
    elif n is None and frac is not None:
        # compute the number of samples based on frac
        n = int(round(frac * axis_length))
    elif n is not None and frac is not None:
        # Pandas specification does not allow both n and frac to be passed
        # in
        raise ValueError('Please enter a value for `frac` OR `n`, not '
                         'both')
    if n < 0:
        raise ValueError("A negative number of rows requested. Please "
                         "provide positive value.")

    if n == 0:
        # An Empty DataFrame is returned if the number of samples is 0.
        # The Empty Dataframe should have either columns or index specified
        # depending on which axis is passed in.
        return DataFrame(columns=[] if axis == 1 else self.columns,
                         index=self.index if axis == 1 else [])

    if axis == 1:
        axis_labels = self.columns
        partition_metadata = self._col_metadata
        partitions = self._col_partitions
    else:
        axis_labels = self.index
        partition_metadata = self._row_metadata
        partitions = self._row_partitions

    if random_state is not None:
        # Get a random number generator depending on the type of
        # random_state that is passed in
        if isinstance(random_state, (int, np.integer)):
            random_num_gen = np.random.RandomState(random_state)
        elif isinstance(random_state, np.random.RandomState):
            random_num_gen = random_state
        else:
            # random_state must be an int or a numpy RandomState object
            raise ValueError("Please enter an `int` OR a "
                             "np.random.RandomState for random_state")

        # Choose positions with the seeded generator, honoring ``replace``
        # and ``weights`` exactly like the unseeded branch does, and then
        # get the corresponding labels from the chosen axis.
        sample_indices = random_num_gen.choice(
            axis_length, size=n, replace=replace, p=weights)
        samples = axis_labels[sample_indices]
    else:
        # randomly select labels from chosen axis
        samples = np.random.choice(a=axis_labels, size=n,
                                   replace=replace, p=weights)

    # create an array of (partition, index_within_partition) tuples for
    # each sample
    part_ind_tuples = [partition_metadata[sample]
                       for sample in samples]

    if axis == 1:
        # tup[0] refers to the partition number and tup[1] is the index
        # within that partition
        new_cols = [_deploy_func.remote(lambda df: df.iloc[:, [tup[1]]],
                    partitions[tup[0]]) for tup in part_ind_tuples]
        return DataFrame(col_partitions=new_cols,
                         columns=samples,
                         index=self.index)
    else:
        new_rows = [_deploy_func.remote(lambda df: df.loc[[tup[1]]],
                    partitions[tup[0]]) for tup in part_ind_tuples]
        return DataFrame(row_partitions=new_rows,
                         columns=self.columns,
                         index=samples)
|
||
|
||
def select(self, crit, axis=0):
    """Placeholder: ``select`` is not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def select_dtypes(self, include=None, exclude=None):
    """Return the subset of columns selected by dtype.

    Args:
        include: A dtype or list-like of dtypes to keep.
        exclude: A dtype or list-like of dtypes to leave out.

    Returns:
        The result of dropping every column that is not selected.
    """
    # Validates arguments for whether both include and exclude are None or
    # if they are disjoint. Also invalidates string dtypes.
    pd.DataFrame().select_dtypes(include, exclude)

    def as_collection(spec):
        # Falsy -> empty list, scalar -> one-element list, else unchanged.
        if not spec:
            return []
        if not is_list_like(spec):
            return [spec]
        return spec

    raw_include = set(as_collection(include))
    raw_exclude = set(as_collection(exclude))
    include = set(map(_get_dtype_from_object, raw_include))
    exclude = set(map(_get_dtype_from_object, raw_exclude))

    # With no constraint on a side, every column passes that side.
    include_these = pd.Series(not bool(include), index=self.columns)
    exclude_these = pd.Series(not bool(exclude), index=self.columns)

    for column, dtype in self.dtypes.iteritems():
        derives_from = functools.partial(issubclass, dtype.type)
        if include:  # checks for the case of empty include or exclude
            include_these[column] = any(map(derives_from, include))
        if exclude:
            exclude_these[column] = not any(map(derives_from, exclude))

    dtype_indexer = include_these & exclude_these
    # Positions of the columns that were not selected.
    rejected = [position
                for position, keep in enumerate(dtype_indexer.values)
                if not keep]
    return self.drop(columns=self.columns[rejected], inplace=False)
|
||
|
||
def sem(self, axis=None, skipna=None, level=None, ddof=1,
        numeric_only=None, **kwargs):
    """Placeholder for ``sem``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def set_axis(self, labels, axis=0, inplace=None):
    """Replace the labels of one axis.

    Args:
        labels (pd.Index or list-like): The new labels for the axis.
        axis (string or int): Which axis receives the labels.
        inplace (bool): Whether to modify this object directly. ``None``
            currently behaves as ``True`` and emits a FutureWarning.

    Returns:
        A relabelled copy when inplace is False, otherwise None.
    """
    if is_scalar(labels):
        # Legacy call order (axis first, labels second): warn and swap.
        warnings.warn(
            'set_axis now takes "labels" as first argument, and '
            '"axis" as named parameter. The old form, with "axis" as '
            'first parameter and \"labels\" as second, is still supported '
            'but will be deprecated in a future version of pandas.',
            FutureWarning, stacklevel=2)
        labels, axis = axis, labels

    if inplace is None:
        warnings.warn(
            'set_axis currently defaults to operating inplace.\nThis '
            'will change in a future version of pandas, use '
            'inplace=True to avoid this warning.',
            FutureWarning, stacklevel=2)
        inplace = True

    if not inplace:
        relabelled = self.copy()
        relabelled.set_axis(labels, axis=axis, inplace=True)
        return relabelled

    # Resolve the axis to its attribute name and assign through it.
    axis_attribute = pd.DataFrame()._get_axis_name(axis)
    setattr(self, axis_attribute, labels)
|
||
|
||
def set_index(self, keys, drop=True, append=False, inplace=False,
              verify_integrity=False):
    """Set the DataFrame index using one or more existing columns.

    Args:
        keys: column label or list of column labels / arrays.
        drop (boolean): Delete columns to be used as the new index.
        append (boolean): Whether to append columns to existing index.
        inplace (boolean): Modify the DataFrame in place.
        verify_integrity (boolean): Check the new index for duplicates.
            Otherwise defer the check until necessary. Setting to False
            will improve the performance of this method

    Returns:
        If inplace is set to false returns a new DataFrame, otherwise None.

    Raises:
        ValueError: If verify_integrity is set and the new index contains
            duplicate keys.
    """
    inplace = validate_bool_kwarg(inplace, 'inplace')
    # A single key is treated as a one-element list of keys.
    if not isinstance(keys, list):
        keys = [keys]

    if inplace:
        frame = self
    else:
        frame = self.copy()

    # ``arrays`` collects the values of each level of the new index and
    # ``names`` the matching level names, in the same order.
    arrays = []
    names = []
    if append:
        # Keep the current index level(s) in front of the new ones.
        names = [x for x in self.index.names]
        if isinstance(self.index, pd.MultiIndex):
            for i in range(self.index.nlevels):
                arrays.append(self.index._get_level_values(i))
        else:
            arrays.append(self.index)

    # Column labels to delete from ``frame`` once the index is built.
    to_remove = []
    for col in keys:
        if isinstance(col, pd.MultiIndex):
            # append all but the last column so we don't have to modify
            # the end of this loop
            for n in range(col.nlevels - 1):
                arrays.append(col._get_level_values(n))

            level = col._get_level_values(col.nlevels - 1)
            names.extend(col.names)
        elif isinstance(col, pd.Series):
            level = col._values
            names.append(col.name)
        elif isinstance(col, pd.Index):
            level = col
            names.append(col.name)
        elif isinstance(col, (list, np.ndarray, pd.Index)):
            # Raw array-likes carry no name. (pd.Index can never reach this
            # branch; it is already handled above.)
            level = col
            names.append(None)
        else:
            # Anything else is treated as a column label of the frame.
            level = frame[col]._values
            names.append(col)
            if drop:
                to_remove.append(col)
        arrays.append(level)

    index = _ensure_index_from_sequences(arrays, names)

    if verify_integrity and not index.is_unique:
        duplicates = index.get_duplicates()
        raise ValueError('Index has duplicate keys: %s' % duplicates)

    for c in to_remove:
        del frame[c]

    # clear up memory usage
    index._cleanup()

    frame.index = index

    if not inplace:
        return frame
|
||
|
||
def set_value(self, index, col, value, takeable=False):
    """Placeholder for ``set_value``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def shift(self, periods=1, freq=None, axis=0):
    """Placeholder for ``shift``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def skew(self, axis=None, skipna=None, level=None, numeric_only=None,
         **kwargs):
    """Compute the unbiased skew along an axis (normalized by N-1).

    Args:
        axis : {index (0), columns (1)}
        skipna : boolean, whether NA/null values are excluded.
        level : int or level name, default None
        numeric_only : boolean, default None

    Returns:
        Whatever ``_arithmetic_helper`` produces for the per-partition
        pandas ``skew`` call.
    """
    def skew_of_partition(df):
        # Runs against each pandas partition with the caller's arguments.
        return df.skew(axis=axis, skipna=skipna, level=level,
                       numeric_only=numeric_only, **kwargs)

    reduced = self._arithmetic_helper(skew_of_partition, axis, level)
    return reduced
|
||
|
||
def slice_shift(self, periods=1, axis=0):
    """Placeholder for ``slice_shift``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
               kind='quicksort', na_position='last', sort_remaining=True,
               by=None):
    """Sort a DataFrame by one of the indices (columns or index).

    Args:
        axis: The axis to sort over.
        level: The MultiIndex level to sort over (not supported yet).
        ascending: Ascending or descending
        inplace: Whether or not to update this DataFrame inplace.
        kind: How to perform the sort.
        na_position: Where to position NA on the sort.
        sort_remaining: On Multilevel Index sort based on all levels.
        by: (Deprecated) argument to pass to sort_values.

    Returns:
        A sorted DataFrame, or None when inplace is True.

    Raises:
        NotImplementedError: If ``level`` is given.
    """
    if level is not None:
        raise NotImplementedError("Multilevel index not yet implemented.")

    if by is not None:
        warnings.warn("by argument to sort_index is deprecated, "
                      "please use .sort_values(by=...)",
                      FutureWarning, stacklevel=2)
        # No by/level conflict check is needed: a non-None ``level`` has
        # already been rejected above.
        return self.sort_values(by, axis=axis, ascending=ascending,
                                inplace=inplace)

    axis = pd.DataFrame()._get_axis_number(axis)

    # Positional arguments for pandas' sort_index; ``inplace`` is always
    # False on the partitions.
    args = (axis, level, ascending, False, kind, na_position,
            sort_remaining)

    def _sort_helper(df, index, axis, *args):
        """Label one partition with the real axis labels and sort it."""
        if axis == 0:
            df.index = index
        else:
            df.columns = index

        result = df.sort_index(*args)
        # Put the input partition back on default positional labels.
        df.reset_index(drop=True, inplace=True)
        df.columns = pd.RangeIndex(len(df.columns))
        return result

    if axis == 0:
        index = self.index
        new_column_parts = _map_partitions(
            lambda df: _sort_helper(df, index, axis, *args),
            self._col_partitions)

        new_columns = self.columns
        # The labels must follow the same direction as the partition data;
        # a bare sort_values() would always be ascending.
        new_index = self.index.sort_values(ascending=ascending)
        new_row_parts = None
    else:
        columns = self.columns
        new_row_parts = _map_partitions(
            lambda df: _sort_helper(df, columns, axis, *args),
            self._row_partitions)

        new_columns = self.columns.sort_values(ascending=ascending)
        new_index = self.index
        new_column_parts = None

    if not inplace:
        return DataFrame(col_partitions=new_column_parts,
                         row_partitions=new_row_parts,
                         index=new_index,
                         columns=new_columns)
    else:
        self._update_inplace(row_partitions=new_row_parts,
                             col_partitions=new_column_parts,
                             columns=new_columns,
                             index=new_index)
|
||
|
||
def sort_values(self, by, axis=0, ascending=True, inplace=False,
                kind='quicksort', na_position='last'):
    """Sorts by a column/row or list of columns/rows.

    Args:
        by: A list of labels for the axis to sort over.
        axis: The axis to sort.
        ascending: Sort in ascending or descending order.
        inplace: If true, do the operation inplace.
        kind: How to sort.
        na_position: Where to put np.nan values.

    Returns:
        A sorted DataFrame, or None when inplace is true.
    """

    axis = pd.DataFrame()._get_axis_number(axis)

    if not is_list_like(by):
        by = [by]

    # Materialize the sort keys as a small pandas frame that is shipped to
    # every partition.
    if axis == 0:
        broadcast_value_dict = {str(col): self[col] for col in by}
        broadcast_values = pd.DataFrame(broadcast_value_dict)
    else:
        # NOTE(review): the slice start:len(index) step appears to select a
        # single row by position, so ``by`` presumably must hold integer
        # positions here -- confirm.
        broadcast_value_list = [to_pandas(self[row::len(self.index)])
                                for row in by]

        index_builder = list(zip(broadcast_value_list, by))

        for row, idx in index_builder:
            row.index = [str(idx)]

        broadcast_values = pd.concat([row for row, idx in index_builder])

    # We are converting the by to string here so that we don't have a
    # collision with the RangeIndex on the inner frame. It is cheap and
    # guarantees that we sort by the correct column.
    by = [str(col) for col in by]

    # Positional arguments for pandas' sort_values; ``inplace`` is always
    # False on the partitions.
    args = (by, axis, ascending, False, kind, na_position)

    def _sort_helper(df, broadcast_values, axis, *args):
        """Sorts the data on a partition.

        Args:
            df: The DataFrame to sort.
            broadcast_values: The by DataFrame to use for the sort.
            axis: The axis to sort over.
            args: The args for the sort.

        Returns:
            A new sorted DataFrame.
        """
        # Align the key frame with the partition's labels, remembering the
        # key labels so they can be dropped again after sorting.
        if axis == 0:
            broadcast_values.index = df.index
            names = broadcast_values.columns
        else:
            broadcast_values.columns = df.columns
            names = broadcast_values.index

        # ``axis ^ 1`` is the opposite axis: keys are attached along it,
        # the sort runs, and the keys are removed again.
        return pd.concat([df, broadcast_values], axis=axis ^ 1,
                         copy=False).sort_values(*args)\
            .drop(names, axis=axis ^ 1)

    if axis == 0:
        new_column_partitions = _map_partitions(
            lambda df: _sort_helper(df, broadcast_values, axis, *args),
            self._col_partitions)

        new_row_partitions = None
        new_columns = self.columns

        # This is important because it allows us to get the axis that we
        # aren't sorting over. We need the order of the columns/rows and
        # this will provide that in the return value.
        new_index = broadcast_values.sort_values(*args).index
    else:
        new_row_partitions = _map_partitions(
            lambda df: _sort_helper(df, broadcast_values, axis, *args),
            self._row_partitions)

        new_column_partitions = None
        new_columns = broadcast_values.sort_values(*args).columns
        new_index = self.index

    if inplace:
        self._update_inplace(row_partitions=new_row_partitions,
                             col_partitions=new_column_partitions,
                             columns=new_columns,
                             index=new_index)
    else:
        return DataFrame(row_partitions=new_row_partitions,
                         col_partitions=new_column_partitions,
                         columns=new_columns,
                         index=new_index)
|
||
|
||
def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
              sort_remaining=True):
    """Placeholder for ``sortlevel``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def squeeze(self, axis=None):
    """Placeholder for ``squeeze``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def stack(self, level=-1, dropna=True):
    """Placeholder for ``stack``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def std(self, axis=None, skipna=None, level=None, ddof=1,
        numeric_only=None, **kwargs):
    """Compute the standard deviation along an axis.

    Args:
        axis (int): The axis to take the std on.
        skipna (bool): True to skip NA values, false otherwise.
        level: Forwarded to pandas and to ``_arithmetic_helper``.
        ddof (int): degrees of freedom
        numeric_only: Forwarded to pandas.

    Returns:
        Whatever ``_arithmetic_helper`` produces for the per-partition
        pandas ``std`` call.
    """
    def std_of_partition(df):
        # Runs against each pandas partition with the caller's arguments.
        return df.std(axis=axis, skipna=skipna, level=level, ddof=ddof,
                      numeric_only=numeric_only, **kwargs)

    reduced = self._arithmetic_helper(std_of_partition, axis, level)
    return reduced
|
||
|
||
def sub(self, other, axis='columns', level=None, fill_value=None):
    """Subtract a DataFrame/Series/scalar from this DataFrame.

    Args:
        other: The object to subtract from this frame.
        axis: The axis to apply the subtraction over.
        level: Multilevel index level to subtract over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the subtraction applied.
    """
    pandas_operator = pd.DataFrame.sub
    return self._operator_helper(pandas_operator, other, axis, level,
                                 fill_value)
|
||
|
||
def subtract(self, other, axis='columns', level=None, fill_value=None):
    """Alias for sub.

    Args:
        other: The object to subtract from this frame.
        axis: The axis to apply the subtraction over.
        level: Multilevel index level to subtract over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the subtraction applied.
    """
    difference = self.sub(other, axis, level, fill_value)
    return difference
|
||
|
||
def swapaxes(self, axis1, axis2, copy=True):
    """Placeholder for ``swapaxes``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def swaplevel(self, i=-2, j=-1, axis=0):
    """Placeholder for ``swaplevel``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def tail(self, n=5):
    """Get the last n rows of the dataframe.

    Args:
        n (int): The number of rows to return. ``0`` yields an empty
            frame.

    Returns:
        A new dataframe with the last n rows of this dataframe.
    """
    if n >= len(self._row_metadata):
        # Hand back a separate object so changes to the result cannot
        # alias this frame.
        return self.copy()

    new_dfs = _map_partitions(lambda df: df.tail(n),
                              self._col_partitions)

    row_labels = self._row_metadata.index
    # ``row_labels[-0:]`` would be the whole index, while the partitions
    # hold zero rows after ``tail(0)``; slice explicitly for n == 0.
    index = row_labels[-n:] if n != 0 else row_labels[:0]
    return DataFrame(col_partitions=new_dfs,
                     col_metadata=self._col_metadata,
                     index=index)
|
||
|
||
def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs):
    """Placeholder for ``take``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_clipboard(self, excel=None, sep=None, **kwargs):
    """Copy the frame to the clipboard via the pandas implementation.

    The frame is converted with ``to_pandas`` first and a
    PendingDeprecationWarning announces the fallback.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    to_pandas(self).to_clipboard(excel, sep, **kwargs)
|
||
|
||
def to_csv(self, path_or_buf=None, sep=',', na_rep='', float_format=None,
           columns=None, header=True, index=True, index_label=None,
           mode='w', encoding=None, compression=None, quoting=None,
           quotechar='"', line_terminator='\n', chunksize=None,
           tupleize_cols=None, date_format=None, doublequote=True,
           escapechar=None, decimal='.'):
    """Write the frame as CSV via the pandas implementation.

    The frame is converted with ``to_pandas`` first and every argument is
    forwarded positionally, in signature order, to pandas' ``to_csv``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    forwarded = (path_or_buf, sep, na_rep, float_format,
                 columns, header, index, index_label,
                 mode, encoding, compression, quoting,
                 quotechar, line_terminator, chunksize,
                 tupleize_cols, date_format, doublequote,
                 escapechar, decimal)
    to_pandas(self).to_csv(*forwarded)
|
||
|
||
def to_dense(self):
    """Placeholder for ``to_dense``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_dict(self, orient='dict', into=dict):
    """Placeholder for ``to_dict``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
             float_format=None, columns=None, header=True, index=True,
             index_label=None, startrow=0, startcol=0, engine=None,
             merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
             freeze_panes=None):
    """Write the frame to an Excel sheet via the pandas implementation.

    The frame is converted with ``to_pandas`` first and every argument is
    forwarded positionally, in signature order, to pandas' ``to_excel``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    forwarded = (excel_writer, sheet_name, na_rep,
                 float_format, columns, header, index,
                 index_label, startrow, startcol, engine,
                 merge_cells, encoding, inf_rep, verbose,
                 freeze_panes)
    to_pandas(self).to_excel(*forwarded)
|
||
|
||
def to_feather(self, fname):
    """Write the frame in feather format via the pandas implementation.

    The frame is converted with ``to_pandas`` first and a
    PendingDeprecationWarning announces the fallback.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    to_pandas(self).to_feather(fname)
|
||
|
||
def to_gbq(self, destination_table, project_id, chunksize=10000,
           verbose=True, reauth=False, if_exists='fail',
           private_key=None):
    """Placeholder for ``to_gbq``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_hdf(self, path_or_buf, key, **kwargs):
    """Write the frame to an HDF store via the pandas implementation.

    The frame is converted with ``to_pandas`` first and a
    PendingDeprecationWarning announces the fallback.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    to_pandas(self).to_hdf(path_or_buf, key, **kwargs)
|
||
|
||
def to_html(self, buf=None, columns=None, col_space=None, header=True,
            index=True, na_rep='np.NaN', formatters=None,
            float_format=None, sparsify=None, index_names=True,
            justify=None, bold_rows=True, classes=None, escape=True,
            max_rows=None, max_cols=None, show_dimensions=False,
            notebook=False, decimal='.', border=None):
    """Render the frame as HTML via the pandas implementation.

    The frame is converted with ``to_pandas`` first and every argument is
    forwarded positionally, in signature order, to pandas' ``to_html``.
    Note that the default ``na_rep`` here is the literal text 'np.NaN'.
    The result of the pandas call is not returned.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    forwarded = (buf, columns, col_space, header,
                 index, na_rep, formatters,
                 float_format, sparsify, index_names,
                 justify, bold_rows, classes, escape,
                 max_rows, max_cols, show_dimensions,
                 notebook, decimal, border)
    to_pandas(self).to_html(*forwarded)
|
||
|
||
def to_json(self, path_or_buf=None, orient=None, date_format=None,
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False, compression=None):
    """Serialize the frame as JSON via the pandas implementation.

    The frame is converted with ``to_pandas`` first and every argument is
    forwarded positionally, in signature order, to pandas' ``to_json``.
    The result of the pandas call is not returned.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    forwarded = (path_or_buf, orient, date_format,
                 double_precision, force_ascii, date_unit,
                 default_handler, lines, compression)
    to_pandas(self).to_json(*forwarded)
|
||
|
||
def to_latex(self, buf=None, columns=None, col_space=None, header=True,
             index=True, na_rep='np.NaN', formatters=None,
             float_format=None, sparsify=None, index_names=True,
             bold_rows=False, column_format=None, longtable=None,
             escape=None, encoding=None, decimal='.', multicolumn=None,
             multicolumn_format=None, multirow=None):
    """Placeholder for ``to_latex``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
    """Serialize the frame with msgpack via the pandas implementation.

    The frame is converted with ``to_pandas`` first and a
    PendingDeprecationWarning announces the fallback. The result of the
    pandas call is not returned.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    to_pandas(self).to_msgpack(path_or_buf, encoding, **kwargs)
|
||
|
||
def to_panel(self):
    """Placeholder for ``to_panel``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_parquet(self, fname, engine='auto', compression='snappy',
               **kwargs):
    """Write the frame as parquet via the pandas implementation.

    The frame is converted with ``to_pandas`` first and a
    PendingDeprecationWarning announces the fallback.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    to_pandas(self).to_parquet(fname, engine, compression, **kwargs)
|
||
|
||
def to_period(self, freq=None, axis=0, copy=True):
    """Placeholder for ``to_period``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_pickle(self, path, compression='infer',
              protocol=pkl.HIGHEST_PROTOCOL):
    """Pickle the frame to ``path`` via the pandas implementation.

    The frame is converted with ``to_pandas`` first and a
    PendingDeprecationWarning announces the fallback.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    to_pandas(self).to_pickle(path, compression, protocol)
|
||
|
||
def to_records(self, index=True, convert_datetime64=True):
    """Placeholder for ``to_records``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_sparse(self, fill_value=None, kind='block'):
    """Placeholder for ``to_sparse``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
           index=True, index_label=None, chunksize=None, dtype=None):
    """Write the frame to a SQL table via the pandas implementation.

    The frame is converted with ``to_pandas`` first and every argument is
    forwarded positionally, in signature order, to pandas' ``to_sql``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    forwarded = (name, con, flavor, schema, if_exists,
                 index, index_label, chunksize, dtype)
    to_pandas(self).to_sql(*forwarded)
|
||
|
||
def to_stata(self, fname, convert_dates=None, write_index=True,
             encoding='latin-1', byteorder=None, time_stamp=None,
             data_label=None, variable_labels=None):
    """Write the frame as a Stata file via the pandas implementation.

    The frame is converted with ``to_pandas`` first and every argument is
    forwarded positionally, in signature order, to pandas' ``to_stata``.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    forwarded = (fname, convert_dates, write_index,
                 encoding, byteorder, time_stamp,
                 data_label, variable_labels)
    to_pandas(self).to_stata(*forwarded)
|
||
|
||
def to_string(self, buf=None, columns=None, col_space=None, header=True,
              index=True, na_rep='np.NaN', formatters=None,
              float_format=None, sparsify=None, index_names=True,
              justify=None, line_width=None, max_rows=None, max_cols=None,
              show_dimensions=False):
    """Placeholder for ``to_string``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
    """Placeholder for ``to_timestamp``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def to_xarray(self):
    """Placeholder for ``to_xarray``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def transform(self, func, *args, **kwargs):
    """Apply ``func`` through ``agg`` and relabel the result like ``self``.

    Args:
        func: Forwarded to ``agg``.
        *args, **kwargs: Forwarded to ``agg``; ``is_transform=True`` is
            added to the keyword arguments.

    Returns:
        The ``agg`` result carrying this frame's columns and index.

    Raises:
        ValueError: If assigning this frame's labels to the result raises
            ValueError.
    """
    kwargs["is_transform"] = True
    transformed = self.agg(func, *args, **kwargs)
    try:
        transformed.columns = self.columns
        transformed.index = self.index
    except ValueError:
        raise ValueError("transforms cannot produce aggregated results")
    else:
        return transformed
|
||
|
||
def truediv(self, other, axis='columns', level=None, fill_value=None):
    """Divide this DataFrame by another DataFrame/Series/scalar.

    Args:
        other: The object to divide this frame by.
        axis: The axis to divide over.
        level: The Multilevel index level to apply divide over.
        fill_value: The value to fill NaNs with.

    Returns:
        A new DataFrame with the division applied.
    """
    pandas_operator = pd.DataFrame.truediv
    return self._operator_helper(pandas_operator, other, axis, level,
                                 fill_value)
|
||
|
||
def truncate(self, before=None, after=None, axis=None, copy=True):
    """Placeholder for ``truncate``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def tshift(self, periods=1, freq=None, axis=0):
    """Placeholder for ``tshift``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def tz_convert(self, tz, axis=0, level=None, copy=True):
    """Placeholder for ``tz_convert``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def tz_localize(self, tz, axis=0, level=None, copy=True,
                ambiguous='raise'):
    """Placeholder for ``tz_localize``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def unstack(self, level=-1, fill_value=None):
    """Placeholder for ``unstack``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def update(self, other, join='left', overwrite=True, filter_func=None,
           raise_conflict=False):
    """Placeholder for ``update``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def var(self, axis=None, skipna=None, level=None, ddof=1,
        numeric_only=None, **kwargs):
    """Compute the variance along an axis.

    Args:
        axis (int): The axis to take the variance on.
        skipna (bool): True to skip NA values, false otherwise.
        level: Forwarded to pandas and to ``_arithmetic_helper``.
        ddof (int): degrees of freedom
        numeric_only: Forwarded to pandas.

    Returns:
        Whatever ``_arithmetic_helper`` produces for the per-partition
        pandas ``var`` call.
    """
    def var_of_partition(df):
        # Runs against each pandas partition with the caller's arguments.
        return df.var(axis=axis, skipna=skipna, level=level, ddof=ddof,
                      numeric_only=numeric_only, **kwargs)

    reduced = self._arithmetic_helper(var_of_partition, axis, level)
    return reduced
|
||
|
||
def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
          errors='raise', try_cast=False, raise_on_error=None):
    """Placeholder for ``where``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def xs(self, key, axis=0, level=None, drop_level=True):
    """Placeholder for ``xs``; not supported by Pandas on Ray yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __getitem__(self, key):
    """Get the column(s) or row slice specified by key.

    Args:
        key : A column name, a row slice, a list-like of column names, a
            boolean indexer, or a callable producing one of those.

    Returns:
        A Pandas Series for a single column, otherwise a DataFrame.

    Raises:
        NotImplementedError: For DataFrame keys and for other keys on
            MultiIndex columns.
    """
    key = com._apply_if_callable(key, self)

    # shortcut if we are an actual column
    is_mi_columns = isinstance(self.columns, pd.MultiIndex)
    try:
        if key in self.columns and not is_mi_columns:
            return self._getitem_column(key)
    except (KeyError, ValueError, TypeError):
        # The membership test can fail for some keys (e.g. unhashable
        # ones); fall through to the other access paths.
        pass

    # see if we can slice the rows
    indexer = self._row_metadata.convert_to_index_sliceable(key)
    if indexer is not None:
        return self._getitem_slice(indexer)

    if isinstance(key, (pd.Series, np.ndarray, pd.Index, list)):
        return self._getitem_array(key)
    elif isinstance(key, DataFrame):
        # TODO: dispatch to a frame-keyed getter once implemented.
        raise NotImplementedError("To contribute to Pandas on Ray, please "
                                  "visit github.com/ray-project/ray.")
    elif is_mi_columns:
        # TODO: dispatch to a multilevel getter once implemented.
        raise NotImplementedError("To contribute to Pandas on Ray, please "
                                  "visit github.com/ray-project/ray.")
    else:
        return self._getitem_column(key)
|
||
|
||
def _getitem_column(self, key):
    """Fetch one column and label it with ``key`` and this frame's index.

    Args:
        key: The column label to look up.

    Returns:
        The fetched object, with ``name`` set to ``key`` and ``index`` set
        to this frame's index.
    """
    # may result in multiple columns?
    owning_partition = self._col_metadata[key, 'partition']
    remote_column = self._getitem_indiv_col(key, owning_partition)
    column = ray.get(remote_column)
    column.name = key
    column.index = self.index
    return column
|
||
|
||
def _getitem_array(self, key):
    """Select rows by boolean mask or columns by a list-like of labels.

    Args:
        key: A boolean indexer over the rows, or a list-like of column
            labels.

    Returns:
        A new DataFrame with the selected rows or columns.

    Raises:
        ValueError: If a boolean key's length differs from the index.
    """
    if com.is_bool_indexer(key):
        if isinstance(key, pd.Series) and \
                not key.index.equals(self.index):
            warnings.warn("Boolean Series key will be reindexed to match "
                          "DataFrame index.", UserWarning, stacklevel=3)
        elif len(key) != len(self.index):
            raise ValueError('Item wrong length {} instead of {}.'.format(
                             len(key), len(self.index)))
        key = check_bool_indexer(self.index, key)

        new_parts = _map_partitions(lambda df: df[key],
                                    self._col_partitions)
        columns = self.columns
        index = self.index[key]

        return DataFrame(col_partitions=new_parts,
                         columns=columns,
                         index=index)
    else:
        columns = self._col_metadata[key].index
        # Build the lookup set once rather than once per column.
        selected = set(columns)
        # NOTE(review): positions follow this frame's column order while
        # ``columns`` comes from the metadata lookup; confirm the two agree
        # when ``key`` is not in frame order.
        indices_for_rows = \
            [i for i, item in enumerate(self.columns)
             if item in selected]

        new_parts = [_deploy_func.remote(
            lambda df: df.__getitem__(indices_for_rows),
            part) for part in self._row_partitions]

        index = self.index

        return DataFrame(row_partitions=new_parts,
                         columns=columns,
                         index=index)
|
||
|
||
def _getitem_indiv_col(self, key, part):
    """Launch a remote task that pulls ``key`` out of one column partition.

    Args:
        key: The column label, looked up in the column metadata.
        part: Position of the column partition to read from.

    Returns:
        The result of ``_deploy_func.remote`` for the extraction.
    """
    loc = self._col_metadata[key]
    if isinstance(loc, pd.Series):
        # NOTE(review): presumably the single-match case, where the lookup
        # yields one metadata row as a Series -- confirm what this
        # expression evaluates to for a scalar comparison.
        index = loc[loc['partition'] == part]
    else:
        # Several matches: keep the positions that live in ``part``.
        index = loc[loc['partition'] == part]['index_within_partition']
    return _deploy_func.remote(
        lambda df: df.__getitem__(index),
        self._col_partitions[part])
|
||
|
||
def _getitem_slice(self, key):
    """Slice the rows of every column partition with ``key``.

    Args:
        key: The row indexer applied to each partition and to the index.

    Returns:
        A new DataFrame with the sliced rows and unchanged column metadata.
    """
    sliced_partitions = _map_partitions(lambda df: df[key],
                                        self._col_partitions)
    sliced_index = self.index[key]
    return DataFrame(col_partitions=sliced_partitions,
                     col_metadata=self._col_metadata,
                     index=sliced_index)
|
||
|
||
def __getattr__(self, key):
    """Fall back to column lookup when regular attribute access fails.

    Args:
        key (str): Attribute name.

    Returns:
        The attribute value, or the column named ``key``.

    Raises:
        AttributeError: If ``key`` is neither an attribute nor a column.
    """
    try:
        attribute = object.__getattribute__(self, key)
    except AttributeError as missing:
        if key not in self.columns:
            raise missing
        return self[key]
    return attribute
|
||
|
||
def __setitem__(self, key, value):
    """Assign ``value`` to the column named ``key``.

    A new label is appended as the last column; an existing one is deleted
    and re-inserted at its previous location.

    Raises:
        NotImplementedError: If ``key`` is not a ``str``.
    """
    if not isinstance(key, str):
        raise NotImplementedError(
            "To contribute to Pandas on Ray, please visit "
            "github.com/ray-project/ray.")

    if key in self.columns:
        position = self.columns.get_loc(key)
        self.__delitem__(key)
        self.insert(loc=position, column=key, value=value)
        return

    self.insert(loc=len(self.columns), column=key, value=value)
|
||
|
||
def __len__(self):
    """Return the length of the dataframe.

    Returns:
        The integer length of the row metadata.
    """
    row_metadata = self._row_metadata
    return len(row_metadata)
|
||
|
||
def __unicode__(self):
    """Placeholder for ``__unicode__``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __invert__(self):
    """Placeholder for ``~frame``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __hash__(self):
    """Placeholder for ``hash(frame)``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __iter__(self):
    """Iterate over the column labels.

    Returns:
        An iterator over ``self.columns``.
    """
    column_labels = self.columns
    return iter(column_labels)
|
||
|
||
def __contains__(self, key):
    """Check whether ``key`` is one of the column labels.

    Args:
        key : The column name

    Returns:
        The result of the columns' own ``__contains__`` for ``key``.
    """
    column_labels = self.columns
    return column_labels.__contains__(key)
|
||
|
||
def __nonzero__(self):
    """Placeholder for truth-value testing; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __bool__(self):
    """Placeholder for ``bool(frame)``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __abs__(self):
    """Implement ``abs(frame)`` by delegating to ``abs()``.

    Returns:
        A modified DataFrame
    """
    absolute = self.abs()
    return absolute
|
||
|
||
def __round__(self, decimals=0):
    """Placeholder for ``round(frame)``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __array__(self, dtype=None):
    """Convert to an array by going through a full pandas frame.

    Args:
        dtype: Forwarded to the pandas frame's ``__array__``.
    """
    # TODO: This is very inefficient and needs fix, also see as_matrix
    pandas_frame = to_pandas(self)
    return pandas_frame.__array__(dtype=dtype)
|
||
|
||
def __array_wrap__(self, result, context=None):
    """Placeholder for ``__array_wrap__``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __getstate__(self):
    """Placeholder for ``__getstate__``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __setstate__(self, state):
    """Placeholder for ``__setstate__``; not supported yet.

    Raises:
        NotImplementedError: Unconditionally.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __delitem__(self, key):
    """Delete a column by key. `del a[key]` for example.
    Operation happens in place.

    Notes: This operation happens on the row and column partitions
        simultaneously. No rebuild.

    Args:
        key: key to delete
    """
    # Create helper method for deleting column(s) in row partition.
    def del_helper(df, to_delete):
        """Drop the column(s) at ``to_delete`` from one partition."""
        cols = df.columns[to_delete]  # either int or an array of ints

        if not is_list_like(cols):
            cols = [cols]

        for col in cols:
            df.__delitem__(col)

        # Reset the column index to conserve space
        df.columns = pd.RangeIndex(0, len(df.columns))
        return df

    # Location of ``key`` in the full column index, applied to every row
    # partition.
    to_delete = self.columns.get_loc(key)
    self._row_partitions = _map_partitions(
        del_helper, self._row_partitions, to_delete)

    # This structure is used to get the correct index inside the partition.
    del_df = self._col_metadata[key]

    # We need to standardize between multiple and single occurrences in the
    # columns. Putting single occurrences in a pd.DataFrame and transposing
    # results in the same structure as multiple with 'loc'.
    if isinstance(del_df, pd.Series):
        del_df = pd.DataFrame(del_df).T

    # Cast cols as pd.Series as duplicate columns mean result may be
    # np.int64 or pd.Series
    col_parts_to_del = \
        pd.Series(self._col_metadata[key, 'partition']).unique()
    # NOTE(review): the return value is ignored, so this presumably drops
    # the key from the metadata in place -- confirm.
    self._col_metadata.drop(key)
    for i in col_parts_to_del:
        # Compute the correct index inside the partition to delete.
        to_delete_in_partition = \
            del_df[del_df['partition'] == i]['index_within_partition']

        self._col_partitions[i] = _deploy_func.remote(
            del_helper, self._col_partitions[i], to_delete_in_partition)

    self._col_metadata.reset_partition_coords(col_parts_to_del)
|
||
|
||
def __finalize__(self, other, method=None, **kwargs):
    """Not implemented in Pandas on Ray yet.

    Args:
        other: Ignored.
        method: Ignored.
        **kwargs: Ignored.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __copy__(self, deep=True):
    """Copy this DataFrame by delegating to ``copy``.

    Args:
        deep: Boolean forwarded unchanged as ``copy(deep=...)``.

    Returns:
        Whatever ``self.copy`` returns.
    """
    duplicate = self.copy(deep=deep)
    return duplicate
|
||
|
||
def __deepcopy__(self, memo=None):
    """Deep-copy this DataFrame; equivalent to ``copy(deep=True)``.

    Args:
        memo: No effect. Accepted only to comply with the Pandas API.

    Returns:
        Whatever ``self.copy(deep=True)`` returns.
    """
    duplicate = self.copy(deep=True)
    return duplicate
|
||
|
||
def __and__(self, other):
    """``self & other``; not implemented in Pandas on Ray yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __or__(self, other):
    """``self | other``; not implemented in Pandas on Ray yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __xor__(self, other):
    """``self ^ other``; not implemented in Pandas on Ray yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def __lt__(self, other):
    """Implement ``self < other`` by delegating to ``lt``."""
    result = self.lt(other)
    return result
|
||
|
||
def __le__(self, other):
    """Implement ``self <= other`` by delegating to ``le``."""
    result = self.le(other)
    return result
|
||
|
||
def __gt__(self, other):
    """Implement ``self > other`` by delegating to ``gt``."""
    result = self.gt(other)
    return result
|
||
|
||
def __ge__(self, other):
    """Implement ``self >= other`` by delegating to ``ge``."""
    result = self.ge(other)
    return result
|
||
|
||
def __eq__(self, other):
    """Implement ``self == other`` by delegating to ``eq``."""
    result = self.eq(other)
    return result
|
||
|
||
def __ne__(self, other):
    """Implement ``self != other`` by delegating to ``ne``."""
    result = self.ne(other)
    return result
|
||
|
||
def __add__(self, other):
    """Implement ``self + other`` by delegating to ``add``."""
    result = self.add(other)
    return result
|
||
|
||
def __iadd__(self, other):
    """Implement ``self += other``.

    Returns the result of ``add``; Python rebinds the target name to it.
    """
    result = self.add(other)
    return result
|
||
|
||
def __radd__(self, other, axis="columns", level=None, fill_value=None):
    """Implement reflected addition (``other + self``) via ``radd``.

    All arguments are forwarded positionally, in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.radd(*forwarded)
|
||
|
||
def __mul__(self, other):
    """Implement ``self * other`` by delegating to ``mul``."""
    result = self.mul(other)
    return result
|
||
|
||
def __imul__(self, other):
    """Implement ``self *= other``.

    Returns the result of ``mul``; Python rebinds the target name to it.
    """
    result = self.mul(other)
    return result
|
||
|
||
def __rmul__(self, other, axis="columns", level=None, fill_value=None):
    """Implement reflected multiplication (``other * self``) via ``rmul``.

    All arguments are forwarded positionally, in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.rmul(*forwarded)
|
||
|
||
def __pow__(self, other):
    """Implement ``self ** other`` by delegating to ``pow``."""
    result = self.pow(other)
    return result
|
||
|
||
def __ipow__(self, other):
    """Implement ``self **= other``.

    Returns the result of ``pow``; Python rebinds the target name to it.
    """
    result = self.pow(other)
    return result
|
||
|
||
def __rpow__(self, other, axis="columns", level=None, fill_value=None):
    """Implement reflected power (``other ** self``) via ``rpow``.

    All arguments are forwarded positionally, in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.rpow(*forwarded)
|
||
|
||
def __sub__(self, other):
    """Implement ``self - other`` by delegating to ``sub``."""
    result = self.sub(other)
    return result
|
||
|
||
def __isub__(self, other):
    """Implement ``self -= other``.

    Returns the result of ``sub``; Python rebinds the target name to it.
    """
    result = self.sub(other)
    return result
|
||
|
||
def __rsub__(self, other, axis="columns", level=None, fill_value=None):
    """Implement reflected subtraction (``other - self``) via ``rsub``.

    All arguments are forwarded positionally, in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.rsub(*forwarded)
|
||
|
||
def __floordiv__(self, other):
    """Implement ``self // other`` by delegating to ``floordiv``."""
    result = self.floordiv(other)
    return result
|
||
|
||
def __ifloordiv__(self, other):
    """Implement ``self //= other``.

    Returns the result of ``floordiv``; Python rebinds the target name
    to it.
    """
    result = self.floordiv(other)
    return result
|
||
|
||
def __rfloordiv__(self, other, axis="columns", level=None,
                  fill_value=None):
    """Implement reflected floor division (``other // self``).

    Delegates to ``rfloordiv``; all arguments are forwarded positionally,
    in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.rfloordiv(*forwarded)
|
||
|
||
def __truediv__(self, other):
    """Implement ``self / other`` by delegating to ``truediv``."""
    result = self.truediv(other)
    return result
|
||
|
||
def __itruediv__(self, other):
    """Implement ``self /= other``.

    Returns the result of ``truediv``; Python rebinds the target name
    to it.
    """
    result = self.truediv(other)
    return result
|
||
|
||
def __rtruediv__(self, other, axis="columns", level=None, fill_value=None):
    """Implement reflected true division (``other / self``).

    Delegates to ``rtruediv``; all arguments are forwarded positionally,
    in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.rtruediv(*forwarded)
|
||
|
||
def __mod__(self, other):
    """Implement ``self % other`` by delegating to ``mod``."""
    result = self.mod(other)
    return result
|
||
|
||
def __imod__(self, other):
    """Implement ``self %= other``.

    Returns the result of ``mod``; Python rebinds the target name to it.
    """
    result = self.mod(other)
    return result
|
||
|
||
def __rmod__(self, other, axis="columns", level=None, fill_value=None):
    """Implement reflected modulo (``other % self``) via ``rmod``.

    All arguments are forwarded positionally, in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.rmod(*forwarded)
|
||
|
||
def __div__(self, other, axis="columns", level=None, fill_value=None):
    """Implement the ``__div__`` division hook via ``div``.

    All arguments are forwarded positionally, in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.div(*forwarded)
|
||
|
||
def __rdiv__(self, other, axis="columns", level=None, fill_value=None):
    """Implement the reflected ``__rdiv__`` division hook via ``rdiv``.

    All arguments are forwarded positionally, in declaration order.
    """
    forwarded = (other, axis, level, fill_value)
    return self.rdiv(*forwarded)
|
||
|
||
def __neg__(self):
    """Compute the element-wise negation of this DataFrame (``-df``).

    Returns:
        A new DataFrame built from the negated block partitions, reusing
        this frame's row and column metadata.

    Raises:
        TypeError: If any dtype is not boolean, numeric or timedelta64.
    """
    # Validate every dtype up front so nothing is dispatched for a frame
    # that cannot be negated.
    for dtype in self.dtypes:
        negatable = (is_bool_dtype(dtype)
                     or is_numeric_dtype(dtype)
                     or is_timedelta64_dtype(dtype))
        if not negatable:
            raise TypeError("Unary negative expects numeric dtype, not {}"
                            .format(dtype))

    negated = [_map_partitions(lambda df: df.__neg__(), blocks)
               for blocks in self._block_partitions]
    new_block_partitions = np.array(negated)

    return DataFrame(block_partitions=new_block_partitions,
                     col_metadata=self._col_metadata,
                     row_metadata=self._row_metadata)
|
||
|
||
def __sizeof__(self):
    """Not implemented in Pandas on Ray yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@property
def __doc__(self):
    # Property stub: reading it always raises NotImplementedError.
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@property
def blocks(self):
    """Not implemented in Pandas on Ray yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@property
def style(self):
    """Not implemented in Pandas on Ray yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def iat(self, axis=None):
    """Not implemented in Pandas on Ray yet.

    Args:
        axis: Ignored.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@property
def loc(self):
    """Purely label-location based indexer for selection by label.

    Not implemented in Pandas on Ray yet: reading this property raises.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@property
def is_copy(self):
    """Not implemented in Pandas on Ray yet.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def at(self, axis=None):
    """Not implemented in Pandas on Ray yet.

    Args:
        axis: Ignored.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def ix(self, axis=None):
    """Not implemented in Pandas on Ray yet.

    Args:
        axis: Ignored.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
@property
def iloc(self):
    """Purely integer-location based indexing for selection by position.

    Not implemented in Pandas on Ray yet: reading this property raises.

    Raises:
        NotImplementedError: Always.
    """
    message = ("To contribute to Pandas on Ray, please visit "
               "github.com/ray-project/ray.")
    raise NotImplementedError(message)
|
||
|
||
def _copartition(self, other, new_index):
    """Reindex the blocks of this frame and `other` onto a shared index.

    NOTE: Each frame's current index and `new_index` are handed to
        `_reindex_helper`, once per entry of that frame's transposed
        block partitions. Both frames are split into the same number of
        new partitions (the larger of the two transposed lengths).

    Args:
        other: The other DataFrame to copartition with.
        new_index: The index both sets of blocks are reindexed to.

    Returns:
        A zip pairing the new partitions of this frame with the new
        partitions of `other`.
    """
    # Put in the object store so they aren't serialized each iteration.
    old_self_index = ray.put(self.index)
    new_index = ray.put(new_index)
    old_other_index = ray.put(other.index)

    self_blocks_t = self._block_partitions.T
    other_blocks_t = other._block_partitions.T
    new_num_partitions = max(len(self_blocks_t), len(other_blocks_t))

    def reindex_all(old_index, transposed_blocks):
        # One `_reindex_helper` submission per entry; each submission
        # yields `new_num_partitions` results.
        submitted = [
            _reindex_helper._submit(
                args=tuple([old_index, new_index, 1, new_num_partitions]
                           + blocks.tolist()),
                num_return_vals=new_num_partitions)
            for blocks in transposed_blocks]
        return np.array(submitted).T

    new_partitions_self = reindex_all(old_self_index, self_blocks_t)
    new_partitions_other = reindex_all(old_other_index, other_blocks_t)

    return zip(new_partitions_self, new_partitions_other)
|
||
|
||
def _operator_helper(self, func, other, axis, level, *args):
    """Route an operation to the DataFrame or non-DataFrame code path.

    Args:
        func: Callable invoked as ``func(df, other, axis, level, *args)``.
        other: The right-hand operand.
        axis: Forwarded to `func` and to the chosen helper.
        level: Forwarded to `func` and to the chosen helper.
        *args: Extra positional arguments appended to each `func` call.

    Returns:
        The result of `_inter_df_op_helper` when `other` is a DataFrame,
        otherwise the result of `_single_df_op_helper`.
    """
    if not isinstance(other, DataFrame):
        def apply_to_frame(df):
            return func(df, other, axis, level, *args)

        return self._single_df_op_helper(apply_to_frame, other, axis, level)

    def apply_to_pair(x, y):
        return func(x, y, axis, level, *args)

    return self._inter_df_op_helper(apply_to_pair, other, axis, level)
|
||
|
||
def _inter_df_op_helper(self, func, other, axis, level):
    """Apply a binary function between this DataFrame and another one.

    Both frames are copartitioned onto the outer join of their indexes,
    then `func` is handed to `_co_op_helper` once per copartition.

    Args:
        func: Callable forwarded to `_co_op_helper`.
        other: The other DataFrame.
        axis: Checked with pandas' axis resolution; not otherwise used.
        level: Must be None; multilevel indexes are not supported.

    Returns:
        A new DataFrame built from the resulting blocks, labelled with
        the outer join of both frames' indexes and columns.

    Raises:
        NotImplementedError: If `level` is not None.
        TypeError: If `other` is not a DataFrame.
    """
    if level is not None:
        raise NotImplementedError("Multilevel index not yet supported "
                                  "in Pandas on Ray")
    # Called only for its validation of `axis`; the resolved axis number
    # is not needed below.
    pd.DataFrame()._get_axis_number(axis)

    # Fail with a clear error instead of reaching the return statement
    # with `new_blocks` unbound.
    if not isinstance(other, DataFrame):
        raise TypeError("other must be a DataFrame, not {}"
                        .format(type(other).__name__))

    # Adding two DataFrames causes an outer join.
    new_column_index = self.columns.join(other.columns, how="outer")
    new_index = self.index.join(other.index, how="outer")
    copartitions = self._copartition(other, new_index)

    new_blocks = \
        np.array([_co_op_helper._submit(
            args=tuple([func, self.columns, other.columns,
                        len(part[0]), None] +
                       np.concatenate(part).tolist()),
            num_return_vals=len(part[0]))
            for part in copartitions])

    # TODO join the Index Metadata objects together for performance.
    return DataFrame(block_partitions=new_blocks,
                     columns=new_column_index,
                     index=new_index)
|
||
|
||
def _single_df_op_helper(self, func, other, axis, level):
    """Apply a function of one frame across this DataFrame's partitions.

    A list-like `other` is checked against the length of the chosen axis
    and `func` is mapped over the column partitions (axis 0) or the row
    partitions (axis 1). Any other `other` maps `func` over the block
    partitions.

    Args:
        func: Callable applied to each partition.
        other: The non-DataFrame operand; used here only to pick the
            partitioning and to validate its length.
        axis: Axis name or number, resolved through pandas.
        level: Must be None; multilevel indexes are not supported.

    Returns:
        A new DataFrame carrying this frame's index, columns and metadata.

    Raises:
        NotImplementedError: If `level` is not None.
        ValueError: If `other` is list-like and its length does not match
            the selected axis.
    """
    if level is not None:
        raise NotImplementedError("Multilevel index not yet supported "
                                  "in Pandas on Ray")
    axis = pd.DataFrame()._get_axis_number(axis)

    # Exactly one of these is filled in below; the rest stay None.
    new_blocks = new_columns = new_rows = None

    if not is_list_like(other):
        new_blocks = np.array([_map_partitions(func, block)
                               for block in self._block_partitions])
    elif axis == 0:
        expected = len(self.index)
        if len(other) != expected:
            raise ValueError(
                "Unable to coerce to Series, length must be {0}: "
                "given {1}".format(expected, len(other)))
        new_columns = _map_partitions(func, self._col_partitions)
    else:
        expected = len(self.columns)
        if len(other) != expected:
            raise ValueError(
                "Unable to coerce to Series, length must be {0}: "
                "given {1}".format(expected, len(other)))
        new_rows = _map_partitions(func, self._row_partitions)

    return DataFrame(col_partitions=new_columns,
                     row_partitions=new_rows,
                     block_partitions=new_blocks,
                     index=self.index,
                     columns=self.columns,
                     col_metadata=self._col_metadata,
                     row_metadata=self._row_metadata)
|
||
|
||
|
||
@ray.remote
def _merge_columns(left_columns, right_columns, *args):
    """Compute the column labels a merge of two frames would produce.

    Builds a one-row placeholder frame for each side and merges them, so
    pandas decides the resulting column names and their order.

    Args:
        left_columns: The columns on the left side of the merge.
        right_columns: The columns on the right side of the merge.
        args: The arguments for the merge, passed positionally to
            ``pd.DataFrame.merge`` after the right-hand frame.

    Returns:
        The columns for the merge operation.
    """
    left_frame = pd.DataFrame(columns=left_columns, index=[0],
                              dtype='uint8')
    right_frame = pd.DataFrame(columns=right_columns, index=[0],
                               dtype='uint8')
    merged = left_frame.merge(right_frame, *args)
    return merged.columns
|