[DataFrame] Refactor __delitem__ (#2080)

* Implement the bug fix
* Fix flake8
2026-07-06 01:07:38 +08:00 · 2018-05-18 11:58:20 -07:00
parent 25e7aa1e79
commit 0b07602c89
3 changed files with 24 additions and 12 deletions
@@ -4904,10 +4904,6 @@ class DataFrame(object):
            df.columns = pd.RangeIndex(0, len(df.columns))
            return df

-        to_delete = self.columns.get_loc(key)
-        self._row_partitions = _map_partitions(
-            del_helper, self._row_partitions, to_delete)
-
        # This structure is used to get the correct index inside the partition.
        del_df = self._col_metadata[key]

@@ -4920,15 +4916,18 @@ class DataFrame(object):
        # Cast cols as pd.Series as duplicate columns mean result may be
        # np.int64 or pd.Series
        col_parts_to_del = \
-            pd.Series(self._col_metadata[key, 'partition']).unique()
+            pd.Series(del_df['partition'].copy()).unique()
        self._col_metadata.drop(key)
+
        for i in col_parts_to_del:
            # Compute the correct index inside the partition to delete.
            to_delete_in_partition = \
                del_df[del_df['partition'] == i]['index_within_partition']

-            self._col_partitions[i] = _deploy_func.remote(
-                del_helper, self._col_partitions[i], to_delete_in_partition)
+            for j in range(self._block_partitions.shape[0]):
+                self._block_partitions[j, i] = _deploy_func.remote(
+                    del_helper, self._block_partitions[j, i],
+                    to_delete_in_partition)

        self._col_metadata.reset_partition_coords(col_parts_to_del)

@@ -203,15 +203,15 @@ class _IndexMetadata(object):
            # updated as well.
            try:
                self._coord_df.loc[partition_mask,
-                                   'index_within_partition'] = [
-                    p for p in range(sum(partition_mask))]
+                                   'index_within_partition'] = np.arange(
+                                       sum(partition_mask)).astype(int)
            except ValueError:
                # Copy the arrow sealed dataframe so we can mutate it.
                # We only do this the first time we try to mutate the sealed.
                self._coord_df = self._coord_df.copy()
                self._coord_df.loc[partition_mask,
-                                   'index_within_partition'] = [
-                    p for p in range(sum(partition_mask))]
+                                   'index_within_partition'] = np.arange(
+                                       sum(partition_mask)).astype(int)

    def insert(self, key, loc=None, partition=None,
               index_within_partition=None):
@@ -357,7 +357,14 @@ class _IndexMetadata(object):

        # Update first lengths to prevent possible length inconsistencies
        if isinstance(dropped, pd.DataFrame):
-            drop_per_part = dropped.groupby(["partition"]).size()\
+            try:
+                drop_per_part = dropped.groupby(["partition"]).size()\
+                        .reindex(index=pd.RangeIndex(len(self._lengths)),
+                                 fill_value=0)
+            except ValueError:
+                # Copy the arrow sealed dataframe so we can mutate it.
+                dropped = dropped.copy()
+                drop_per_part = dropped.groupby(["partition"]).size()\
                    .reindex(index=pd.RangeIndex(len(self._lengths)),
                             fill_value=0)
        elif isinstance(dropped, pd.Series):
@@ -3274,6 +3274,12 @@ def test___delitem__(ray_df, pd_df):
    pd_df.__delitem__('col1')
    ray_df_equals_pandas(ray_df, pd_df)

+    # Issue 2027
+    last_label = pd_df.iloc[:, -1].name
+    ray_df.__delitem__(last_label)
+    pd_df.__delitem__(last_label)
+    ray_df_equals_pandas(ray_df, pd_df)
+

 def test___finalize__():
    ray_df = create_test_dataframe()