mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 05:25:28 +08:00
Adding support for concat (#1739)
adding tests fixing flake8 adding init flake 8 on test fixing tests, imports, and flake8 handling for index adding tests for row, index added more robust error handling for axis fixing test failures cleaning up error sfor 2.7 updating travis resolving import fixing flake8 moved import order Fixing to refactor and delaying implementing ray-pd inner concat resolving ray-pd concat and from_pandas mutation Revert "resolving ray-pd concat and from_pandas mutation" This reverts commit 5db43e4e89e328286532f3ef98a4526575c5d08d.
This commit is contained in:
committed by
Devin Petersohn
parent
3039cca242
commit
efeaacbedc
@@ -30,9 +30,10 @@ def get_npartitions():
|
||||
from .dataframe import DataFrame # noqa: 402
|
||||
from .series import Series # noqa: 402
|
||||
from .io import (read_csv, read_parquet) # noqa: 402
|
||||
from .concat import concat # noqa: 402
|
||||
|
||||
__all__ = [
|
||||
"DataFrame", "Series", "read_csv", "read_parquet"
|
||||
"DataFrame", "Series", "read_csv", "read_parquet", "concat"
|
||||
]
|
||||
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from .dataframe import DataFrame as rdf
|
||||
from .utils import (
|
||||
from_pandas,
|
||||
_deploy_func)
|
||||
from functools import reduce
|
||||
|
||||
|
||||
def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
|
||||
keys=None, levels=None, names=None, verify_integrity=False,
|
||||
copy=True):
|
||||
|
||||
def _concat(frame1, frame2):
|
||||
# Check type on objects
|
||||
# Case 1: Both are Pandas DF
|
||||
if isinstance(frame1, pd.DataFrame) and \
|
||||
isinstance(frame2, pd.DataFrame):
|
||||
|
||||
return pd.concat((frame1, frame2), axis, join, join_axes,
|
||||
ignore_index, keys, levels, names,
|
||||
verify_integrity, copy)
|
||||
|
||||
if not (isinstance(frame1, rdf) and
|
||||
isinstance(frame2, rdf)) and join == 'inner':
|
||||
raise NotImplementedError(
|
||||
"Obj as dicts not implemented. To contribute to "
|
||||
"Pandas on Ray, please visit github.com/ray-project/ray."
|
||||
)
|
||||
|
||||
# Case 2: Both are different types
|
||||
if isinstance(frame1, pd.DataFrame):
|
||||
frame1 = from_pandas(frame1, len(frame1) / 2**16 + 1)
|
||||
if isinstance(frame2, pd.DataFrame):
|
||||
frame2 = from_pandas(frame2, len(frame2) / 2**16 + 1)
|
||||
|
||||
# Case 3: Both are Ray DF
|
||||
if isinstance(frame1, rdf) and \
|
||||
isinstance(frame2, rdf):
|
||||
|
||||
new_columns = frame1.columns.join(frame2.columns, how=join)
|
||||
|
||||
def _reindex_helper(pdf, old_columns, join):
|
||||
pdf.columns = old_columns
|
||||
if join == 'outer':
|
||||
pdf = pdf.reindex(columns=new_columns)
|
||||
else:
|
||||
pdf = pdf[new_columns]
|
||||
pdf.columns = pd.RangeIndex(len(new_columns))
|
||||
|
||||
return pdf
|
||||
|
||||
f1_columns, f2_columns = frame1.columns, frame2.columns
|
||||
new_f1 = [_deploy_func.remote(lambda p: _reindex_helper(p,
|
||||
f1_columns, join), part) for
|
||||
part in frame1._row_partitions]
|
||||
new_f2 = [_deploy_func.remote(lambda p: _reindex_helper(p,
|
||||
f2_columns, join), part) for
|
||||
part in frame2._row_partitions]
|
||||
|
||||
return rdf(row_partitions=new_f1 + new_f2, columns=new_columns,
|
||||
index=frame1.index.append(frame2.index))
|
||||
|
||||
# (TODO) Group all the pandas dataframes
|
||||
|
||||
if isinstance(objs, dict):
|
||||
raise NotImplementedError(
|
||||
"Obj as dicts not implemented. To contribute to "
|
||||
"Pandas on Ray, please visit github.com/ray-project/ray."
|
||||
)
|
||||
|
||||
axis = pd.DataFrame()._get_axis_number(axis)
|
||||
if axis == 1:
|
||||
raise NotImplementedError(
|
||||
"Concat not implemented for axis=1. To contribute to "
|
||||
"Pandas on Ray, please visit github.com/ray-project/ray."
|
||||
)
|
||||
|
||||
all_pd = np.all([isinstance(obj, pd.DataFrame) for obj in objs])
|
||||
if all_pd:
|
||||
result = pd.concat(objs, axis, join, join_axes,
|
||||
ignore_index, keys, levels, names,
|
||||
verify_integrity, copy)
|
||||
else:
|
||||
result = reduce(_concat, objs)
|
||||
|
||||
if isinstance(result, pd.DataFrame):
|
||||
return from_pandas(result, len(result) / 2**16 + 1)
|
||||
|
||||
return result
|
||||
@@ -0,0 +1,107 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
import ray.dataframe as rdf
|
||||
from ray.dataframe.utils import (
|
||||
to_pandas,
|
||||
from_pandas
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_df_equals_pandas(ray_df, pandas_df):
|
||||
return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index())
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_df_equals(ray_df1, ray_df2):
|
||||
return to_pandas(ray_df1).sort_index().equals(
|
||||
to_pandas(ray_df2).sort_index()
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def generate_dfs():
|
||||
df = pd.DataFrame({'col1': [0, 1, 2, 3],
|
||||
'col2': [4, 5, 6, 7],
|
||||
'col3': [8, 9, 10, 11],
|
||||
'col4': [12, 13, 14, 15],
|
||||
'col5': [0, 0, 0, 0]})
|
||||
|
||||
df2 = pd.DataFrame({'col1': [0, 1, 2, 3],
|
||||
'col2': [4, 5, 6, 7],
|
||||
'col3': [8, 9, 10, 11],
|
||||
'col6': [12, 13, 14, 15],
|
||||
'col7': [0, 0, 0, 0]})
|
||||
return df, df2
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_df_concat():
|
||||
df, df2 = generate_dfs()
|
||||
|
||||
assert(ray_df_equals_pandas(rdf.concat([df, df2]), pd.concat([df, df2])))
|
||||
|
||||
|
||||
def test_ray_concat():
|
||||
df, df2 = generate_dfs()
|
||||
ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
|
||||
|
||||
assert(ray_df_equals_pandas(rdf.concat([ray_df, ray_df2]),
|
||||
pd.concat([df, df2])))
|
||||
|
||||
|
||||
def test_ray_concat_on_index():
|
||||
df, df2 = generate_dfs()
|
||||
ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
|
||||
|
||||
assert(ray_df_equals_pandas(rdf.concat([ray_df, ray_df2], axis='index'),
|
||||
pd.concat([df, df2], axis='index')))
|
||||
|
||||
assert(ray_df_equals_pandas(rdf.concat([ray_df, ray_df2], axis='rows'),
|
||||
pd.concat([df, df2], axis='rows')))
|
||||
|
||||
assert(ray_df_equals_pandas(rdf.concat([ray_df, ray_df2], axis=0),
|
||||
pd.concat([df, df2], axis=0)))
|
||||
|
||||
|
||||
def test_ray_concat_on_column():
|
||||
df, df2 = generate_dfs()
|
||||
ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
rdf.concat([ray_df, ray_df2], axis=1)
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
rdf.concat([ray_df, ray_df2], axis="columns")
|
||||
|
||||
|
||||
def test_invalid_axis_errors():
|
||||
df, df2 = generate_dfs()
|
||||
ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
rdf.concat([ray_df, ray_df2], axis=2)
|
||||
|
||||
|
||||
def test_mixed_concat():
|
||||
df, df2 = generate_dfs()
|
||||
df3 = df.copy()
|
||||
|
||||
mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
|
||||
|
||||
assert(ray_df_equals_pandas(rdf.concat(mixed_dfs),
|
||||
pd.concat([df, df2, df3])))
|
||||
|
||||
|
||||
def test_mixed_inner_concat():
|
||||
df, df2 = generate_dfs()
|
||||
df3 = df.copy()
|
||||
|
||||
mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
rdf.concat(mixed_dfs, join="inner")
|
||||
@@ -5,10 +5,11 @@ from __future__ import print_function
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import ray.dataframe as rdf
|
||||
import ray.dataframe.io as io
|
||||
import os
|
||||
|
||||
from ray.dataframe.utils import to_pandas
|
||||
|
||||
TEST_PARQUET_FILENAME = 'test.parquet'
|
||||
TEST_CSV_FILENAME = 'test.csv'
|
||||
SMALL_ROW_SIZE = 2000
|
||||
@@ -17,7 +18,7 @@ LARGE_ROW_SIZE = 7e6
|
||||
|
||||
@pytest.fixture
|
||||
def ray_df_equals_pandas(ray_df, pandas_df):
|
||||
return rdf.to_pandas(ray_df).sort_index().equals(pandas_df.sort_index())
|
||||
return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index())
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
Reference in New Issue
Block a user