[DataFrame] Adding insert, set_axis, set_index, reset_index and tests (#1603)

2026-06-28 17:50:55 +08:00 · 2018-02-26 08:58:15 -08:00
parent c2ad800cbf
commit 1fa59f1887
2 changed files with 384 additions and 30 deletions
@@ -3,6 +3,14 @@ from __future__ import division
 from __future__ import print_function

 import pandas as pd
+from pandas.api.types import is_scalar
+from pandas.util._validators import validate_bool_kwarg
+from pandas.core.index import _ensure_index_from_sequences
+from pandas._libs import lib
+from pandas.core.dtypes.cast import maybe_upcast_putmask
+from pandas.compat import lzip
+
+import warnings
 import numpy as np
 import ray
 import itertools
@@ -792,7 +800,52 @@ class DataFrame(object):
        raise NotImplementedError("Not Yet implemented.")

    def insert(self, loc, column, value, allow_duplicates=False):
-        raise NotImplementedError("Not Yet implemented.")
+        """Insert column into DataFrame at specified location.
+
+        Args:
+            loc (int): Insertion index. Must verify 0 <= loc <= len(columns).
+            column (hashable object): Label of the inserted column.
+            value (int, Series, or array-like): The values to insert.
+            allow_duplicates (bool): Whether to allow duplicate column names.
+        """
+        try:
+            len(value)
+        except TypeError:
+            value = [value for _ in range(len(self.index))]
+
+        if len(value) != len(self.index):
+            raise ValueError(
+                "Column length provided does not match DataFrame length.")
+        if loc < 0 or loc > len(self.columns):
+            raise ValueError(
+                "Location provided must be higher than 0 and lower than the "
+                "number of columns.")
+        if not allow_duplicates and column in self.columns:
+            raise ValueError(
+                "Column {} already exists in DataFrame.".format(column))
+
+        cumulative = np.cumsum(self._lengths)
+        partitions = [value[cumulative[i-1]:cumulative[i]]
+                      for i in range(len(cumulative))
+                      if i != 0]
+
+        partitions.insert(0, value[:cumulative[0]])
+
+        # Because insert is always inplace, we have to create this temp fn.
+        def _insert(_df, _loc, _column, _part, _allow_duplicates):
+            _df.insert(_loc, _column, _part, _allow_duplicates)
+            return _df
+
+        self._df = \
+            [_deploy_func.remote(_insert,
+                                 self._df[i],
+                                 loc,
+                                 column,
+                                 partitions[i],
+                                 allow_duplicates)
+             for i in range(len(self._df))]
+
+        self.columns = self.columns.insert(loc, column)

    def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
                    limit_direction='forward', downcast=None, **kwargs):
@@ -1047,6 +1100,8 @@ class DataFrame(object):
        popped = to_pandas(self._map_partitions(
            lambda df: df.pop(item)))
        self._df = self._map_partitions(lambda df: df.drop([item], axis=1))._df
+        self.columns = [col for col in self.columns if col != item]
+
        return popped

    def pow(self, other, axis='columns', level=None, fill_value=None):
@@ -1111,7 +1166,103 @@ class DataFrame(object):

    def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
                    col_fill=''):
-        raise NotImplementedError("Not Yet implemented.")
+        """Reset this index to default and create column from current index.
+
+        Args:
+            level: Only remove the given levels from the index. Removes all
+                levels by default
+            drop: Do not try to insert index into dataframe columns. This
+                resets the index to the default integer index.
+            inplace: Modify the DataFrame in place (do not create a new object)
+            col_level : If the columns have multiple levels, determines which
+                level the labels are inserted into. By default it is inserted
+                into the first level.
+            col_fill: If the columns have multiple levels, determines how the
+                other levels are named. If None then the index name is
+                repeated.
+
+        Returns:
+            A new DataFrame if inplace is False, None otherwise.
+        """
+        inplace = validate_bool_kwarg(inplace, 'inplace')
+        if inplace:
+            new_obj = self
+        else:
+            new_obj = self.copy()
+
+        def _maybe_casted_values(index, labels=None):
+            if isinstance(index, pd.PeriodIndex):
+                values = index.asobject.values
+            elif isinstance(index, pd.DatetimeIndex) and index.tz is not None:
+                values = index
+            else:
+                values = index.values
+                if values.dtype == np.object_:
+                    values = lib.maybe_convert_objects(values)
+
+            # if we have the labels, extract the values with a mask
+            if labels is not None:
+                mask = labels == -1
+
+                # we can have situations where the whole mask is -1,
+                # meaning there is nothing found in labels, so make all nan's
+                if mask.all():
+                    values = np.empty(len(mask))
+                    values.fill(np.nan)
+                else:
+                    values = values.take(labels)
+                    if mask.any():
+                        values, changed = maybe_upcast_putmask(
+                            values, mask, np.nan)
+            return values
+
+        new_index = new_obj._default_index().index
+        if level is not None:
+            if not isinstance(level, (tuple, list)):
+                level = [level]
+            level = [self.index._get_level_number(lev) for lev in level]
+            if isinstance(self.index, pd.MultiIndex):
+                if len(level) < self.index.nlevels:
+                    new_index = self.index.droplevel(level)
+
+        if not drop:
+            if isinstance(self.index, pd.MultiIndex):
+                names = [n if n is not None else ('level_%d' % i)
+                         for (i, n) in enumerate(self.index.names)]
+                to_insert = lzip(self.index.levels, self.index.labels)
+            else:
+                default = 'index' if 'index' not in self else 'level_0'
+                names = ([default] if self.index.name is None
+                         else [self.index.name])
+                to_insert = ((self.index, None),)
+
+            multi_col = isinstance(self.columns, pd.MultiIndex)
+            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
+                if not (level is None or i in level):
+                    continue
+                name = names[i]
+                if multi_col:
+                    col_name = (list(name) if isinstance(name, tuple)
+                                else [name])
+                    if col_fill is None:
+                        if len(col_name) not in (1, self.columns.nlevels):
+                            raise ValueError("col_fill=None is incompatible "
+                                             "with incomplete column name "
+                                             "{}".format(name))
+                        col_fill = col_name[0]
+
+                    lev_num = self.columns._get_level_number(col_level)
+                    name_lst = [col_fill] * lev_num + col_name
+                    missing = self.columns.nlevels - len(name_lst)
+                    name_lst += [col_fill] * missing
+                    name = tuple(name_lst)
+                # to ndarray and maybe infer different dtype
+                level_values = _maybe_casted_values(lev, lab)
+                new_obj.insert(0, name, level_values)
+
+        new_obj.index = new_index
+        if not inplace:
+            return new_obj

    def rfloordiv(self, other, axis='columns', level=None, fill_value=None):
        raise NotImplementedError("Not Yet implemented.")
@@ -1155,11 +1306,116 @@ class DataFrame(object):
        raise NotImplementedError("Not Yet implemented.")

    def set_axis(self, labels, axis=0, inplace=None):
-        raise NotImplementedError("Not Yet implemented.")
+        """Assign desired index to given axis.
+
+        Args:
+            labels (pd.Index or list-like): The Index to assign.
+            axis (string or int): The axis to reassign.
+            inplace (bool): Whether to make these modifications inplace.
+
+        Returns:
+            If inplace is False, returns a new DataFrame, otherwise None.
+        """
+        if is_scalar(labels):
+            warnings.warn(
+                'set_axis now takes "labels" as first argument, and '
+                '"axis" as named parameter. The old form, with "axis" as '
+                'first parameter and \"labels\" as second, is still supported '
+                'but will be deprecated in a future version of pandas.',
+                FutureWarning, stacklevel=2)
+            labels, axis = axis, labels
+
+        if inplace is None:
+            warnings.warn(
+                'set_axis currently defaults to operating inplace.\nThis '
+                'will change in a future version of pandas, use '
+                'inplace=True to avoid this warning.',
+                FutureWarning, stacklevel=2)
+            inplace = True
+        if inplace:
+            setattr(self, self._index._get_axis_name(axis), labels)
+        else:
+            obj = self.copy()
+            obj.set_axis(labels, axis=axis, inplace=True)
+            return obj

    def set_index(self, keys, drop=True, append=False, inplace=False,
                  verify_integrity=False):
-        raise NotImplementedError("Not Yet implemented.")
+        """Set the DataFrame index using one or more existing columns.
+
+        Args:
+            keys: column label or list of column labels / arrays.
+            drop (boolean): Delete columns to be used as the new index.
+            append (boolean): Whether to append columns to existing index.
+            inplace (boolean): Modify the DataFrame in place.
+            verify_integrity (boolean): Check the new index for duplicates.
+                Otherwise defer the check until necessary. Setting to False
+                will improve the performance of this method
+
+        Returns:
+            If inplace is set to false returns a new DataFrame, otherwise None.
+        """
+        inplace = validate_bool_kwarg(inplace, 'inplace')
+        if not isinstance(keys, list):
+            keys = [keys]
+
+        if inplace:
+            frame = self
+        else:
+            frame = self.copy()
+
+        arrays = []
+        names = []
+        if append:
+            names = [x for x in self.index.names]
+            if isinstance(self.index, pd.MultiIndex):
+                for i in range(self.index.nlevels):
+                    arrays.append(self.index._get_level_values(i))
+            else:
+                arrays.append(self.index)
+
+        to_remove = []
+        for col in keys:
+            if isinstance(col, pd.MultiIndex):
+                # append all but the last column so we don't have to modify
+                # the end of this loop
+                for n in range(col.nlevels - 1):
+                    arrays.append(col._get_level_values(n))
+
+                level = col._get_level_values(col.nlevels - 1)
+                names.extend(col.names)
+            elif isinstance(col, pd.Series):
+                level = col._values
+                names.append(col.name)
+            elif isinstance(col, pd.Index):
+                level = col
+                names.append(col.name)
+            elif isinstance(col, (list, np.ndarray, pd.Index)):
+                level = col
+                names.append(None)
+            else:
+                level = frame[col]._values
+                names.append(col)
+                if drop:
+                    to_remove.append(col)
+            arrays.append(level)
+
+        index = _ensure_index_from_sequences(arrays, names)
+
+        if verify_integrity and not index.is_unique:
+            duplicates = index.get_duplicates()
+            raise ValueError('Index has duplicate keys: %s' % duplicates)
+
+        for c in to_remove:
+            del frame[c]
+
+        # clear up memory usage
+        index._cleanup()
+
+        frame.index = index
+
+        if not inplace:
+            return frame

    def set_value(self, index, col, value, takeable=False):
        raise NotImplementedError("Not Yet implemented.")
@@ -1416,7 +1672,7 @@ class DataFrame(object):
        raise NotImplementedError("Not Yet implemented.")

    def __contains__(self, key):
-        raise NotImplementedError("Not Yet implemented.")
+        return key in self.columns

    def __nonzero__(self):
        raise NotImplementedError("Not Yet implemented.")
@@ -1715,4 +1971,7 @@ def to_pandas(df):
    Returns:
        A new pandas DataFrame.
    """
-    return pd.concat(ray.get(df._df))
+    pd_df = pd.concat(ray.get(df._df))
+    pd_df.index = df.index
+    pd_df.columns = df.columns
+    return pd_df
@@ -216,6 +216,29 @@ def test_int_dataframe():
    test_notna(ray_df, pandas_df)
    test_notnull(ray_df, pandas_df)

+    labels = ['a', 'b', 'c', 'd']
+    test_set_axis(ray_df, pandas_df, labels, 0)
+    test_set_axis(ray_df, pandas_df, labels, 'rows')
+    labels.append('e')
+    test_set_axis(ray_df, pandas_df, labels, 1)
+    test_set_axis(ray_df, pandas_df, labels, 'columns')
+
+    for key in keys:
+        test_set_index(ray_df, pandas_df, key)
+
+    test_reset_index(ray_df, pandas_df)
+    test_reset_index(ray_df, pandas_df, inplace=True)
+
+    for key in keys:
+        test___contains__(ray_df, key, True)
+    test___contains__(ray_df, "Not Exists", False)
+
+    for key in keys:
+        test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
+        test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
+

 def test_float_dataframe():

@@ -288,6 +311,26 @@ def test_float_dataframe():
    test_iteritems(ray_df, pandas_df)
    test_itertuples(ray_df, pandas_df)

+    labels = ['a', 'b', 'c', 'd']
+    test_set_axis(ray_df, pandas_df, labels, 0)
+    test_set_axis(ray_df, pandas_df, labels, 'rows')
+    labels.append('e')
+    test_set_axis(ray_df, pandas_df, labels, 1)
+    test_set_axis(ray_df, pandas_df, labels, 'columns')
+
+    for key in keys:
+        test_set_index(ray_df, pandas_df, key)
+        test_set_index(ray_df, pandas_df, key, inplace=True)
+
+    test_reset_index(ray_df, pandas_df)
+    test_reset_index(ray_df, pandas_df, inplace=True)
+
+    for key in keys:
+        test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
+        test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
+

 def test_mixed_dtype_dataframe():
    pandas_df = pd.DataFrame({
@@ -365,6 +408,25 @@ def test_mixed_dtype_dataframe():
    test_iteritems(ray_df, pandas_df)
    test_itertuples(ray_df, pandas_df)

+    labels = ['a', 'b', 'c', 'd']
+    test_set_axis(ray_df, pandas_df, labels, 0)
+    test_set_axis(ray_df, pandas_df, labels, 'rows')
+    test_set_axis(ray_df, pandas_df, labels, 1)
+    test_set_axis(ray_df, pandas_df, labels, 'columns')
+
+    for key in keys:
+        test_set_index(ray_df, pandas_df, key)
+        test_set_index(ray_df, pandas_df, key, inplace=True)
+
+    test_reset_index(ray_df, pandas_df)
+    test_reset_index(ray_df, pandas_df, inplace=True)
+
+    for key in keys:
+        test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
+        test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
+

 def test_nan_dataframe():
    pandas_df = pd.DataFrame({
@@ -435,6 +497,25 @@ def test_nan_dataframe():
    test_iteritems(ray_df, pandas_df)
    test_itertuples(ray_df, pandas_df)

+    labels = ['a', 'b', 'c', 'd']
+    test_set_axis(ray_df, pandas_df, labels, 0)
+    test_set_axis(ray_df, pandas_df, labels, 'rows')
+    test_set_axis(ray_df, pandas_df, labels, 1)
+    test_set_axis(ray_df, pandas_df, labels, 'columns')
+
+    for key in keys:
+        test_set_index(ray_df, pandas_df, key)
+        test_set_index(ray_df, pandas_df, key, inplace=True)
+
+    test_reset_index(ray_df, pandas_df)
+    test_reset_index(ray_df, pandas_df, inplace=True)
+
+    for key in keys:
+        test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
+        test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
+        test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
+

 def test_add():
    ray_df = create_test_dataframe()
@@ -902,11 +983,13 @@ def test_info():
        ray_df.info()


-def test_insert():
-    ray_df = create_test_dataframe()
+@pytest.fixture
+def test_insert(ray_df, pandas_df, loc, column, value):
+    ray_df_cp = ray_df.copy()
+    pd_df_cp = pandas_df.copy()

-    with pytest.raises(NotImplementedError):
-        ray_df.insert(None, None, None)
+    ray_df_cp.insert(loc, column, value)
+    pd_df_cp.insert(loc, column, value)


 def test_interpolate():
@@ -1307,11 +1390,19 @@ def test_resample():
        ray_df.resample(None)


-def test_reset_index():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.reset_index()
+@pytest.fixture
+def test_reset_index(ray_df, pandas_df, inplace=False):
+    if not inplace:
+        print(rdf.to_pandas(ray_df.reset_index(inplace=inplace)).index)
+        print(pandas_df.reset_index(inplace=inplace))
+        assert rdf.to_pandas(ray_df.reset_index(inplace=inplace)).equals(
+            pandas_df.reset_index(inplace=inplace))
+    else:
+        ray_df_cp = ray_df.copy()
+        pd_df_cp = pandas_df.copy()
+        ray_df_cp.reset_index(inplace=inplace)
+        pd_df_cp.reset_index(inplace=inplace)
+        assert rdf.to_pandas(ray_df_cp).equals(pd_df_cp)


 def test_rfloordiv():
@@ -1397,18 +1488,23 @@ def test_sem():
        ray_df.sem()


-def test_set_axis():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.set_axis(None)
+@pytest.fixture
+def test_set_axis(ray_df, pandas_df, label, axis):
+    assert rdf.to_pandas(ray_df.set_axis(label, axis, inplace=False)).equals(
+        pandas_df.set_axis(label, axis, inplace=False))


-def test_set_index():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.set_index(None)
+@pytest.fixture
+def test_set_index(ray_df, pandas_df, keys, inplace=False):
+    if not inplace:
+        assert rdf.to_pandas(ray_df.set_index(keys)).equals(
+            pandas_df.set_index(keys))
+    else:
+        ray_df_cp = ray_df.copy()
+        pd_df_cp = pandas_df.copy()
+        ray_df_cp.set_index(keys, inplace=inplace)
+        pd_df_cp.set_index(keys, inplace=inplace)
+        assert rdf.to_pandas(ray_df_cp).equals(pd_df_cp)


 def test_set_value():
@@ -1817,11 +1913,10 @@ def test___iter__():
        ray_df.__iter__()


-def test___contains__():
-    ray_df = create_test_dataframe()
-
-    with pytest.raises(NotImplementedError):
-        ray_df.__contains__(None)
+@pytest.fixture
+def test___contains__(ray_df, key, result):
+    assert result == ray_df.__contains__(key)
+    assert result == (key in ray_df)


 def test___nonzero__():