[DataFrame] Speed up dtypes (#2118)

* Don't recreate _block_partitions in _correct_dtypes

Further dtypes performance optimizations

Fix bugs

Redesign speedup

Address feedback

* Remove _correct_column_dtypes
This commit is contained in:
Peter Schafhalter
2018-05-23 16:35:17 -07:00
committed by Devin Petersohn
parent 9ff3d57429
commit 68b11c8251
2 changed files with 14 additions and 31 deletions
+8 -20
View File
@@ -43,8 +43,8 @@ from .utils import (
_co_op_helper,
_match_partitioning,
_concat_index,
_correct_column_dtypes,
fix_blocks_dimensions)
fix_blocks_dimensions,
_compile_remote_dtypes)
from . import get_npartitions
from .index_metadata import _IndexMetadata
from .iterator import PartitionIterator
@@ -169,7 +169,7 @@ class DataFrame(object):
index=columns, axis=1)
if self._dtypes_cache is None:
self._correct_dtypes()
self._get_remote_dtypes()
def _get_frame_data(self):
data = {}
@@ -455,23 +455,11 @@ class DataFrame(object):
result.index = self.columns
return result
def _correct_dtypes(self):
    """Corrects dtypes by concatenating column blocks and then splitting them
    apart back into the original blocks.

    Also caches ObjectIDs for the dtypes of every column.
    """
    # Re-concatenation is only needed when there is more than one row of
    # blocks; iterating over the transpose walks the grid column by column.
    if self._block_partitions.shape[0] > 1:
        self._block_partitions = np.array(
            [_correct_column_dtypes._submit(
                args=column, num_return_vals=len(column))
             for column in self._block_partitions.T]).T

    # The dtypes are read from the first row of blocks only.
    self._dtypes_cache = [_deploy_func.remote(lambda df: df.dtypes, pd_df)
                          for pd_df in self._block_partitions[0]]

def _get_remote_dtypes(self):
    """Finds and caches ObjectIDs for the dtypes of each column partition.
    """
    # One remote task per column of the block grid; every block in that
    # column is passed to the task as a positional argument.
    self._dtypes_cache = [_compile_remote_dtypes.remote(*column)
                          for column in self._block_partitions.T]
@property
def dtypes(self):
@@ -584,7 +572,7 @@ class DataFrame(object):
self._block_partitions[:, 0], index=index, axis=0)
# Update dtypes
self._correct_dtypes()
self._get_remote_dtypes()
def add_prefix(self, prefix):
"""Add a prefix to each of the column names.
+6 -11
View File
@@ -456,20 +456,15 @@ def _concat_index(*index_parts):
return index_parts[0].append(index_parts[1:])
@ray.remote
def _correct_column_dtypes(*column):
    """Corrects dtypes of a column by merging its partitions into one frame
    and splitting that frame back into partitions.

    Args:
        *column: The partitions that make up a single column.

    Returns:
        The result of ``create_blocks_helper`` for the merged column and
        ``len(column)`` partitions (presumably the re-split blocks; confirm
        against ``create_blocks_helper``).
    """
    num_partitions = len(column)
    merged = pd.concat(column, copy=False)
    return create_blocks_helper(merged, num_partitions, 1)
def fix_blocks_dimensions(blocks, axis):
    """Checks that blocks is 2D, and adds dimensions if not.

    Args:
        blocks: Array exposing ``ndim`` that can be passed to
            ``np.expand_dims``.
        axis: 0 or 1. The new length-1 dimension is inserted at position
            ``axis ^ 1``, i.e. on the opposite axis.

    Returns:
        ``blocks`` itself when it already has two or more dimensions,
        otherwise a 2D array holding the same data.
    """
    if blocks.ndim >= 2:
        return blocks
    if blocks.ndim == 0:
        # A 0-d input needs two new dimensions: a single expand_dims call
        # yields at most a 1-D array, and axis=0 would ask for position 1,
        # which is out of range for the 1-D result.
        blocks = np.expand_dims(blocks, axis=0)
    return np.expand_dims(blocks, axis=axis ^ 1)
@ray.remote
def _compile_remote_dtypes(*column_of_blocks):
    """Computes the dtypes of one column of blocks from a sample of each.

    Only the first row of every block is concatenated, so the combined
    dtypes are obtained without concatenating the full blocks.

    Args:
        *column_of_blocks: The pandas DataFrames that make up one column
            of the block grid.

    Returns:
        The ``dtypes`` of the concatenated one-row samples.
    """
    # Select by position rather than by label: ``loc[0:0]`` only returns a
    # row when the block's index contains the label 0, whereas ``iloc[:1]``
    # returns the first row for any index (and nothing for an empty block).
    first_rows = [df.iloc[:1] for df in column_of_blocks]
    return pd.concat(first_rows, copy=False).dtypes