From 6455ec934b102353837e7d0537c25dc89f0cd79b Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Mon, 12 Mar 2018 19:05:32 -0700 Subject: [PATCH] [DataFrame] Implements DataFrame.rename, DataFrame.rename_axis, and Index.set_names (#1573) * Index update * Fixed transpose bug with nan values * Fix lint * Add rename tests * Implement DataFrame.rename, DataFrame.rename_axis, and Index.set_names * Temp * Fixing rename for new index implementation Fix rebase merges * Fix rename and rename_axis to work with new index. Re-add pytest fixture Clean up rebase artifacts Remove index.py file * Addressing minor points * Addressing comments --- python/ray/dataframe/dataframe.py | 77 +++++- python/ray/dataframe/test/test_dataframe.py | 253 +++++++++++++++++++- 2 files changed, 317 insertions(+), 13 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 8f0d9c47b..a0de0db01 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -1742,14 +1742,79 @@ class DataFrame(object): def rename(self, mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + if mapper is None and index is None and columns is None: + raise TypeError('must pass an index to rename') + + if axis is None: + if columns is not None: + new_df = [ + _deploy_func.remote( + lambda df: df.rename(columns=columns, + copy=copy, level=level), + part + ) + for part in self._df + ] + new_columns = pd.DataFrame(columns=self.columns)\ + .rename(columns=columns, copy=copy, level=level)\ + .columns + new_df = DataFrame(new_df, new_columns, self.index) + else: + new_df = self.copy() + if index is not None: + new_df.index = self._index.rename(index=index, copy=copy, + level=level).index + else: + new_df = self._map_partitions( + lambda df: df.rename(mapper=mapper, axis=axis, copy=copy, + level=level) + ) + 
new_df._index = new_df._index.rename(mapper=mapper, axis=axis, + copy=copy, level=level) + new_df.columns = pd.DataFrame(columns=new_df.columns)\ + .rename(mapper=mapper, axis=axis, copy=copy, + level=level).columns + + if inplace: + self._update_inplace( + df=new_df._df, + columns=new_df.columns, + index=new_df.index + ) + else: + return new_df def rename_axis(self, mapper, axis=0, copy=True, inplace=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + axes_is_columns = axis == 1 or axis == "columns" + renamed = self if inplace else self.copy() + if axes_is_columns: + renamed.columns.name = mapper + else: + renamed._index.rename_axis(mapper, axis=axis, copy=copy, + inplace=True) + if not inplace: + return renamed + + def _set_axis_name(self, name, axis=0, inplace=False): + """Alter the name or names of the axis. + + Args: + name: Name for the Index, or list of names for the MultiIndex + axis: 0 or 'index' for the index; 1 or 'columns' for the columns + inplace: Whether to modify `self` directly or return a copy + + Returns: + Type of caller or None if inplace=True. 
+ """ + axes_is_columns = axis == 1 or axis == "columns" + renamed = self if inplace else self.copy() + if axes_is_columns: + renamed.columns.set_names(name) + else: + renamed._index.set_names(name) + + if not inplace: + return renamed def reorder_levels(self, order, axis=0): raise NotImplementedError( diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index e38063db3..ec6041b60 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -1960,18 +1960,257 @@ def test_reindex_like(): ray_df.reindex_like(None) -def test_rename(): - ray_df = create_test_dataframe() +# Renaming - with pytest.raises(NotImplementedError): - ray_df.rename() +def test_rename(): + test_rename_sanity() + test_rename_multiindex() + # TODO: Uncomment when __setitem__ is implemented + # test_rename_nocopy() + test_rename_inplace() + test_rename_bug() + + +@pytest.fixture +def test_rename_sanity(num_partitions=2): + test_data = TestData() + mapping = { + 'A': 'a', + 'B': 'b', + 'C': 'c', + 'D': 'd' + } + + ray_df = rdf.from_pandas(test_data.frame, num_partitions) + assert ray_df_equals_pandas( + ray_df.rename(columns=mapping), + test_data.frame.rename(columns=mapping) + ) + + renamed2 = test_data.frame.rename(columns=str.lower) + assert ray_df_equals_pandas( + ray_df.rename(columns=str.lower), + renamed2 + ) + + ray_df = rdf.from_pandas(renamed2, num_partitions) + assert ray_df_equals_pandas( + ray_df.rename(columns=str.upper), + renamed2.rename(columns=str.upper) + ) + + # index + data = { + 'A': {'foo': 0, 'bar': 1} + } + + # gets sorted alphabetical + df = pd.DataFrame(data) + ray_df = rdf.from_pandas(df, num_partitions) + tm.assert_index_equal( + ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}).index, + df.rename(index={'foo': 'bar', 'bar': 'foo'}).index + ) + + tm.assert_index_equal( + ray_df.rename(index=str.upper).index, + df.rename(index=str.upper).index + ) + + # have to pass 
something + pytest.raises(TypeError, ray_df.rename) + + # partial columns + renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}) + ray_df = rdf.from_pandas(test_data.frame, num_partitions) + tm.assert_index_equal( + ray_df.rename(columns={'C': 'foo', 'D': 'bar'}).index, + test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}).index + ) + + # TODO: Uncomment when transpose works + # other axis + # renamed = test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}) + # tm.assert_index_equal( + # test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}).index, + # ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index + # ) + + # index with name + index = pd.Index(['foo', 'bar'], name='name') + renamer = pd.DataFrame(data, index=index) + + ray_df = rdf.from_pandas(renamer, num_partitions) + renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) + ray_renamed = ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}) + tm.assert_index_equal( + renamed.index, ray_renamed.index + ) + + assert renamed.index.name == ray_renamed.index.name + + +@pytest.fixture +def test_rename_multiindex(num_partitions=2): + tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] + tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] + index = pd.MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) + columns = pd.MultiIndex.from_tuples( + tuples_columns, names=['fizz', 'buzz']) + df = pd.DataFrame([(0, 0), (1, 1)], index=index, columns=columns) + ray_df = rdf.from_pandas(df, num_partitions) + + # + # without specifying level -> across all levels + renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, + columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) + ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, + columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) + tm.assert_index_equal( + renamed.index, ray_renamed.index + ) + + renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, + columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) + 
tm.assert_index_equal(renamed.columns, ray_renamed.columns) + assert renamed.index.names == ray_renamed.index.names + assert renamed.columns.names == ray_renamed.columns.names + + # + # with specifying a level (GH13766) + + # dict + renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level=0) + ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level=0) + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level='fizz') + ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level='fizz') + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + + renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level=1) + ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level=1) + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level='buzz') + ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, + level='buzz') + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + + # function + func = str.upper + renamed = df.rename(columns=func, level=0) + ray_renamed = ray_df.rename(columns=func, level=0) + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + renamed = df.rename(columns=func, level='fizz') + ray_renamed = ray_df.rename(columns=func, level='fizz') + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + + renamed = df.rename(columns=func, level=1) + ray_renamed = ray_df.rename(columns=func, level=1) + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + renamed = df.rename(columns=func, level='buzz') + ray_renamed = ray_df.rename(columns=func, level='buzz') + tm.assert_index_equal(renamed.columns, ray_renamed.columns) + + # index + renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, + level=0) + ray_renamed = ray_df.rename(index={'foo1': 'foo3', 
'bar2': 'bar3'}, + level=0) + tm.assert_index_equal(ray_renamed.index, renamed.index) + + +@pytest.fixture +def test_rename_nocopy(num_partitions=2): + test_data = TestData().frame + ray_df = rdf.from_pandas(test_data, num_partitions) + ray_renamed = ray_df.rename(columns={'C': 'foo'}, copy=False) + ray_renamed['foo'] = 1 + assert (ray_df['C'] == 1).all() + + +@pytest.fixture +def test_rename_inplace(num_partitions=2): + test_data = TestData().frame + ray_df = rdf.from_pandas(test_data, num_partitions) + + assert ray_df_equals_pandas( + ray_df.rename(columns={'C': 'foo'}), + test_data.rename(columns={'C': 'foo'}) + ) + + frame = test_data.copy() + ray_frame = ray_df.copy() + frame.rename(columns={'C': 'foo'}, inplace=True) + ray_frame.rename(columns={'C': 'foo'}, inplace=True) + + assert ray_df_equals_pandas( + ray_frame, + frame + ) + + +@pytest.fixture +def test_rename_bug(num_partitions=2): + # GH 5344 + # rename set ref_locs, and set_index was not resetting + df = pd.DataFrame({0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]}) + ray_df = rdf.from_pandas(df, num_partitions) + df = df.rename(columns={0: 'a'}) + df = df.rename(columns={1: 'b'}) + # TODO: Uncomment when set_index is implemented + # df = df.set_index(['a', 'b']) + # df.columns = ['2001-01-01'] + + ray_df = ray_df.rename(columns={0: 'a'}) + ray_df = ray_df.rename(columns={1: 'b'}) + # TODO: Uncomment when set_index is implemented + # ray_df = ray_df.set_index(['a', 'b']) + # ray_df.columns = ['2001-01-01'] + + assert ray_df_equals_pandas( + ray_df, + df + ) def test_rename_axis(): - ray_df = create_test_dataframe() + test_rename_axis_inplace() - with pytest.raises(NotImplementedError): - ray_df.rename_axis(None) + +@pytest.fixture +def test_rename_axis_inplace(num_partitions=2): + test_frame = TestData().frame + ray_df = rdf.from_pandas(test_frame, num_partitions) + + # GH 15704 + result = test_frame.copy() + ray_result = ray_df.copy() + no_return = result.rename_axis('foo', inplace=True) + 
ray_no_return = ray_result.rename_axis('foo', inplace=True) + + assert no_return is ray_no_return + assert ray_df_equals_pandas( + ray_result, + result + ) + + result = test_frame.copy() + ray_result = ray_df.copy() + no_return = result.rename_axis('bar', axis=1, inplace=True) + ray_no_return = ray_result.rename_axis('bar', axis=1, inplace=True) + + assert no_return is ray_no_return + assert ray_df_equals_pandas( + ray_result, + result + ) def test_reorder_levels():