mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 23:09:51 +08:00
[DataFrame] Fix transpose with nan values and add functionality needed for Index (#1545)
This commit is contained in:
committed by
Robert Nishihara
parent
db4a920bdb
commit
de6fa02c85
@@ -7,10 +7,12 @@ import numpy as np
|
||||
import ray
|
||||
import itertools
|
||||
|
||||
from .index import Index
|
||||
|
||||
|
||||
class DataFrame(object):
|
||||
|
||||
def __init__(self, df, columns):
|
||||
def __init__(self, df, columns, index=None):
|
||||
"""Distributed DataFrame object backed by Pandas dataframes.
|
||||
|
||||
Args:
|
||||
@@ -22,29 +24,53 @@ class DataFrame(object):
|
||||
assert(len(df) > 0)
|
||||
|
||||
self._df = df
|
||||
# TODO: Clean up later.
|
||||
# We will call get only when we access the object (and only once).
|
||||
self._lengths = \
|
||||
ray.get([_deploy_func.remote(_get_lengths, d) for d in self._df])
|
||||
self.columns = columns
|
||||
|
||||
if index is None:
|
||||
self._index = self._default_index()
|
||||
else:
|
||||
self._index = index
|
||||
|
||||
self._pd_index = None
|
||||
|
||||
def __str__(self):
    """Return the fixed placeholder text used when printing this object."""
    label = "ray.DataFrame object"
    return label
|
||||
|
||||
def __repr__(self):
    """Return the fixed placeholder text used as this object's repr."""
    label = "ray.DataFrame object"
    return label
|
||||
|
||||
def _get_index(self):
    """Get the index for this DataFrame.

    The pandas index is built from the ray Index held in ``self._index``
    the first time it is requested and is then kept in ``self._pd_index``
    so later reads return the same object.

    Returns:
        The pandas index produced by ``Index.to_pandas(self._index)``.
    """
    cached = self._pd_index
    if cached is None:
        # First access (or the index was replaced): convert and remember.
        cached = Index.to_pandas(self._index)
        self._pd_index = cached

    return cached
|
||||
|
||||
def _set_index(self, new_index):
    """Set the index for this DataFrame.

    Args:
        new_index: The new index labels. They are converted with
            ``Index.from_pandas`` using this frame's partition lengths
            (``self._lengths``).
    """
    # Drop the cached pandas index so the next read of ``index`` rebuilds
    # it from the new ray Index.
    self._pd_index = None
    rebuilt = Index.from_pandas(new_index, self._lengths)
    self._index = rebuilt
|
||||
|
||||
def _default_index(self):
    """Build the default ray Index for this DataFrame.

    Returns:
        An Index whose keys are 0..N-1 (N being the sum of
        ``self._lengths``) and whose values are (partition number,
        row offset within that partition) tuples, tagged with
        ``pd.RangeIndex`` as its pandas type.
    """
    coords = []
    for part_no, part_len in enumerate(self._lengths):
        for row_no in range(part_len):
            coords.append((part_no, row_no))

    return Index(dict(enumerate(coords)), pd.RangeIndex)
|
||||
|
||||
index = property(_get_index, _set_index)
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
@@ -140,7 +166,7 @@ class DataFrame(object):
|
||||
assert(callable(func))
|
||||
new_df = [_deploy_func.remote(func, part) for part in self._df]
|
||||
|
||||
return DataFrame(new_df, self.columns)
|
||||
return DataFrame(new_df, self.columns, index=self._index)
|
||||
|
||||
def add_prefix(self, prefix):
|
||||
"""Add a prefix to each of the column names.
|
||||
@@ -150,7 +176,7 @@ class DataFrame(object):
|
||||
"""
|
||||
new_dfs = self._map_partitions(lambda df: df.add_prefix(prefix))
|
||||
new_cols = self.columns.map(lambda x: str(prefix) + str(x))
|
||||
return DataFrame(new_dfs._df, new_cols)
|
||||
return DataFrame(new_dfs._df, new_cols, index=self._index)
|
||||
|
||||
def add_suffix(self, suffix):
|
||||
"""Add a suffix to each of the column names.
|
||||
@@ -160,7 +186,7 @@ class DataFrame(object):
|
||||
"""
|
||||
new_dfs = self._map_partitions(lambda df: df.add_suffix(suffix))
|
||||
new_cols = self.columns.map(lambda x: str(x) + str(suffix))
|
||||
return DataFrame(new_dfs._df, new_cols)
|
||||
return DataFrame(new_dfs._df, new_cols, index=self._index)
|
||||
|
||||
def applymap(self, func):
|
||||
"""Apply a function to a DataFrame elementwise.
|
||||
@@ -177,7 +203,7 @@ class DataFrame(object):
|
||||
Returns:
|
||||
A new DataFrame pointing to the same partitions as this one.
|
||||
"""
|
||||
return DataFrame(self._df, self.columns)
|
||||
return DataFrame(self._df, self.columns, index=self._index)
|
||||
|
||||
def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
|
||||
group_keys=True, squeeze=False, **kwargs):
|
||||
@@ -199,11 +225,8 @@ class DataFrame(object):
|
||||
[index for df in ray.get(self._df) for index in list(df.index)]))
|
||||
|
||||
chunksize = int(len(indices) / len(self._df))
|
||||
partitions = []
|
||||
|
||||
for df in self._df:
|
||||
partitions.append(_shuffle.remote(df, indices, chunksize))
|
||||
|
||||
partitions = [_shuffle.remote(df, indices, chunksize)
|
||||
for df in self._df]
|
||||
partitions = ray.get(partitions)
|
||||
|
||||
# Transpose the list of dataframes
|
||||
@@ -213,7 +236,6 @@ class DataFrame(object):
|
||||
shuffle.append([])
|
||||
for j in range(len(partitions)):
|
||||
shuffle[i].append(partitions[j][i])
|
||||
|
||||
new_dfs = [_local_groupby.remote(part, axis=axis) for part in shuffle]
|
||||
|
||||
return DataFrame(new_dfs, self.columns)
|
||||
@@ -311,8 +333,10 @@ class DataFrame(object):
|
||||
"""
|
||||
local_transpose = self._map_partitions(
|
||||
lambda df: df.transpose(*args, **kwargs))
|
||||
|
||||
# Sum will collapse the NAs from the groupby
|
||||
return local_transpose.reduce_by_index(lambda df: df.sum(), axis=1)
|
||||
return local_transpose.reduce_by_index(
|
||||
lambda df: df.apply(lambda x: x), axis=1)
|
||||
|
||||
T = property(transpose)
|
||||
|
||||
@@ -1502,6 +1526,24 @@ class DataFrame(object):
|
||||
raise NotImplementedError("Not Yet implemented.")
|
||||
|
||||
|
||||
def _get_lengths(df):
    """Gets the length of the dataframe.

    Args:
        df: The object to measure (a pd.DataFrame partition in normal use).

    Returns:
        ``len(df)``, or 0 when ``df`` does not support ``len``.
    """
    try:
        length = len(df)
    except TypeError:
        # Some partitions hold summary statistics rather than a DataFrame;
        # those have no length and are counted as empty.
        length = 0
    return length
|
||||
|
||||
|
||||
@ray.remote
|
||||
def _shuffle(df, indices, chunksize):
|
||||
"""Shuffle data by sending it through the Ray Store.
|
||||
@@ -1518,12 +1560,12 @@ def _shuffle(df, indices, chunksize):
|
||||
i = 0
|
||||
partition = []
|
||||
while len(indices) > chunksize:
|
||||
oids = df.reindex(indices[:chunksize]).dropna()
|
||||
oids = df.reindex(indices[:chunksize])
|
||||
partition.append(oids)
|
||||
indices = indices[chunksize:]
|
||||
i += 1
|
||||
else:
|
||||
oids = df.reindex(indices).dropna()
|
||||
oids = df.reindex(indices)
|
||||
partition.append(oids)
|
||||
return partition
|
||||
|
||||
@@ -1581,16 +1623,27 @@ def from_pandas(df, npartitions=None, chunksize=None, sort=True):
|
||||
elif chunksize is None:
|
||||
raise ValueError("The number of partitions or chunksize must be set.")
|
||||
|
||||
old_index = df.index
|
||||
|
||||
# TODO stop reassigning df
|
||||
dataframes = []
|
||||
lengths = []
|
||||
while len(df) > chunksize:
|
||||
top = ray.put(df[:chunksize])
|
||||
t_df = df[:chunksize]
|
||||
lengths.append(len(t_df))
|
||||
# reindex here because we want a pd.RangeIndex within the partitions.
|
||||
# It is smaller and sometimes faster.
|
||||
t_df.reindex()
|
||||
top = ray.put(t_df)
|
||||
dataframes.append(top)
|
||||
df = df[chunksize:]
|
||||
else:
|
||||
dataframes.append(ray.put(df))
|
||||
lengths.append(len(df))
|
||||
|
||||
return DataFrame(dataframes, df.columns)
|
||||
ray_index = Index.from_pandas(old_index, lengths)
|
||||
|
||||
return DataFrame(dataframes, df.columns, index=ray_index)
|
||||
|
||||
|
||||
def to_pandas(df):
|
||||
|
||||
@@ -7,15 +7,50 @@ import pandas as pd
|
||||
|
||||
class Index(object):
    """Maps index labels to (partition, row) coordinates."""

    def __init__(self, idx, pandas_type):
        """Store the label mapping and the originating pandas index type.

        Args:
            idx (dict): Maps each index label to a
                (partition number, row offset) tuple.
            pandas_type (type): Type of the index the labels came from;
                ``pd.RangeIndex`` selects the range-based conversion in
                ``to_pandas``.
        """
        self.idx = idx
        self.pandas_type = pandas_type

    def __getitem__(self, item):
        """Return the (partition, row) coordinates stored for a label."""
        return self.idx[item]

    def __len__(self):
        """Return the number of labels in the index."""
        return len(self.idx)

    @classmethod
    def to_pandas(cls, index):
        """Convert a Ray Index object to a Pandas Index object.

        Args:
            index (ray.Index): A Ray Index object.

        Returns:
            A pandas Index object. For a RangeIndex-typed index the result
            is an ascending pd.RangeIndex that preserves the spacing of the
            stored labels; an empty index yields ``pd.RangeIndex(0)``.
        """
        if index.pandas_type is not pd.RangeIndex:
            return pd.Index(list(index.idx))

        # Sort instead of relying on dict order, like the min/max logic
        # this replaces.
        keys = sorted(index.idx)
        if not keys:
            return pd.RangeIndex(0)

        # Recover the step from the labels so a stepped range such as
        # RangeIndex(0, 10, 2) round-trips with the right length and values.
        step = keys[1] - keys[0] if len(keys) > 1 else 1
        start = keys[0]
        stop = keys[-1] + step
        if list(range(start, stop, step)) == keys:
            return pd.RangeIndex(start, stop, step)

        # Labels are not evenly spaced, so no RangeIndex can represent them.
        return pd.Index(keys)

    @classmethod
    def from_pandas(cls, pd_index, lengths):
        """Convert a Pandas Index object to a Ray Index object.

        Args:
            pd_index (pd.Index): A Pandas Index object.
            lengths ([int]): A list of lengths for the partitions.

        Returns:
            A Ray Index object.

        Raises:
            ValueError: If ``pd_index`` does not have exactly
                ``sum(lengths)`` entries.
        """
        dest_indices = [(part_no, row_no)
                        for part_no, part_len in enumerate(lengths)
                        for row_no in range(part_len)]
        if len(pd_index) != len(dest_indices):
            raise ValueError(
                "Length of index given does not match current dataframe")

        # Build through ``cls`` so subclasses get instances of themselves.
        return cls(
            {pd_index[pos]: dest for pos, dest in enumerate(dest_indices)},
            type(pd_index))
|
||||
|
||||
@@ -22,6 +22,12 @@ def test_roundtrip(ray_df, pandas_df):
|
||||
@pytest.fixture
def test_index(ray_df, pandas_df):
    """Check that the ray index matches pandas, before and after reassignment.

    Args:
        ray_df: The ray DataFrame under test.
        pandas_df: The pandas DataFrame it is expected to mirror.
    """
    assert(ray_df.index.equals(pandas_df.index))
    ray_df_cp = ray_df.copy()
    pandas_df_cp = pandas_df.copy()

    ray_df_cp.index = [str(i) for i in ray_df_cp.index]
    pandas_df_cp.index = [str(i) for i in pandas_df_cp.index]
    # Sort both sides: string labels order lexicographically ('10' < '2'),
    # so sorting only the ray index breaks the comparison once the frame
    # has ten or more rows.
    assert(ray_df_cp.index.sort_values().equals(
        pandas_df_cp.index.sort_values()))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -41,10 +47,7 @@ def test_ftypes(ray_df, pandas_df):
|
||||
|
||||
@pytest.fixture
def test_values(ray_df, pandas_df):
    """Check that both frames expose equal ``values`` arrays.

    Args:
        ray_df: The ray DataFrame under test.
        pandas_df: The pandas DataFrame it is expected to mirror.
    """
    actual = ray_df.values
    expected = pandas_df.values
    np.testing.assert_equal(actual, expected)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -339,6 +342,51 @@ def test_mixed_dtype_dataframe():
|
||||
test_notnull(ray_df, pandas_df)
|
||||
|
||||
|
||||
def test_nan_dataframe():
    """Run the shared DataFrame checks on a frame with a NaN in every column."""
    pandas_df = pd.DataFrame({
        'col1': [1, 2, 3, np.nan],
        'col2': [4, 5, np.nan, 7],
        'col3': [8, np.nan, 10, 11],
        'col4': [np.nan, 13, 14, 15]})

    ray_df = rdf.from_pandas(pandas_df, 2)

    testfuncs = [lambda x: x + x,
                 lambda x: str(x),
                 lambda x: x,
                 lambda x: False]

    keys = ['col1',
            'col2',
            'col3',
            'col4']

    # Checks that compare the ray frame against the pandas frame, in the
    # order they must run.
    paired_checks = (
        test_roundtrip,
        test_index,
        test_size,
        test_ndim,
        test_ftypes,
        test_values,
        test_axes,
        test_shape,
        test_add_prefix,
        test_add_suffix,
    )
    for check in paired_checks:
        check(ray_df, pandas_df)

    for testfunc in testfuncs:
        test_applymap(ray_df, pandas_df, testfunc)

    test_copy(ray_df)
    for check in (test_sum, test_keys, test_transpose):
        check(ray_df, pandas_df)

    for key in keys:
        test_get(ray_df, pandas_df, key)

    for check in (test_get_dtype_counts, test_get_ftype_counts):
        check(ray_df, pandas_df)
|
||||
|
||||
|
||||
def test_add():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user