[DataFrame] Speed up dtypes (#2118)

* Don't recreate _block_partitions in _correct_dtypes; further dtypes performance optimizations; fix bugs; redesign speedup; address feedback
* Remove _correct_column_dtypes
2026-06-30 05:57:18 +08:00 · 2018-05-23 16:35:17 -07:00
parent 9ff3d57429
commit 68b11c8251
2 changed files with 14 additions and 31 deletions
@@ -43,8 +43,8 @@ from .utils import (
    _co_op_helper,
    _match_partitioning,
    _concat_index,
-    _correct_column_dtypes,
-    fix_blocks_dimensions)
+    fix_blocks_dimensions,
+    _compile_remote_dtypes)
 from . import get_npartitions
 from .index_metadata import _IndexMetadata
 from .iterator import PartitionIterator
@@ -169,7 +169,7 @@ class DataFrame(object):
                                                index=columns, axis=1)

        if self._dtypes_cache is None:
-            self._correct_dtypes()
+            self._get_remote_dtypes()

    def _get_frame_data(self):
        data = {}
@@ -455,23 +455,11 @@ class DataFrame(object):
        result.index = self.columns
        return result

-    def _correct_dtypes(self):
-        """Corrects dtypes by concatenating column blocks and then splitting them
-        apart back into the original blocks.
-
-        Also caches ObjectIDs for the dtypes of every column.
-
-        Args:
-            block_partitions: arglist of column blocks.
+    def _get_remote_dtypes(self):
+        """Finds and caches ObjectIDs for the dtypes of each column partition.
+
+        Calls `_compile_remote_dtypes.remote` once per row of
+        `self._block_partitions.T`, passing that row's blocks as positional
+        arguments, and stores the returned values as a list in
+        `self._dtypes_cache`. `self._block_partitions` itself is only read.
        """
-        if self._block_partitions.shape[0] > 1:
-            self._block_partitions = np.array(
-                    [_correct_column_dtypes._submit(
-                     args=column, num_return_vals=len(column))
-                     for column in self._block_partitions.T]).T
-
-        self._dtypes_cache = [_deploy_func.remote(lambda df: df.dtypes, pd_df)
-                              for pd_df in self._block_partitions[0]]
+        self._dtypes_cache = [_compile_remote_dtypes.remote(*column)
+                              for column in self._block_partitions.T]

    @property
    def dtypes(self):
@@ -584,7 +572,7 @@ class DataFrame(object):
                self._block_partitions[:, 0], index=index, axis=0)

        # Update dtypes
-        self._correct_dtypes()
+        self._get_remote_dtypes()

    def add_prefix(self, prefix):
        """Add a prefix to each of the column names.
@@ -456,20 +456,15 @@ def _concat_index(*index_parts):
    return index_parts[0].append(index_parts[1:])


-@ray.remote
-def _correct_column_dtypes(*column):
-    """Corrects dtypes of a column by concatenating column partitions and
-    splitting the column back into partitions.
-
-    Args:
-    """
-    concat_column = pd.concat(column, copy=False)
-    return create_blocks_helper(concat_column, len(column), 1)
-
-
 def fix_blocks_dimensions(blocks, axis):
    """Checks that blocks is 2D, and adds a dimension if not.

    Args:
        blocks: Array-like exposing `ndim`; presumably the numpy array of
            block partitions -- confirm against callers.
        axis: Axis the blocks were produced along. The new dimension is
            inserted at `axis ^ 1`, which is the other axis when `axis` is
            0 or 1.

    Returns:
        `blocks` itself when it already has at least two dimensions,
        otherwise the result of `np.expand_dims(blocks, axis=axis ^ 1)`.
    """
    if blocks.ndim < 2:
        return np.expand_dims(blocks, axis=axis ^ 1)
    return blocks
+
+
+@ray.remote
+def _compile_remote_dtypes(*column_of_blocks):
+    small_dfs = [df.loc[0:0] for df in column_of_blocks]
+    return pd.concat(small_dfs, copy=False).dtypes