mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 15:43:55 +08:00
[DataFrame] Speed up dtypes (#2118)
* Don't recreate _block_partitions in _correct_dtypes Further dtypes performance optimizations Fix bugs Redesign speedup Address feedback * Remove _correct_column_dtypes
This commit is contained in:
committed by
Devin Petersohn
parent
9ff3d57429
commit
68b11c8251
@@ -43,8 +43,8 @@ from .utils import (
|
||||
_co_op_helper,
|
||||
_match_partitioning,
|
||||
_concat_index,
|
||||
_correct_column_dtypes,
|
||||
fix_blocks_dimensions)
|
||||
fix_blocks_dimensions,
|
||||
_compile_remote_dtypes)
|
||||
from . import get_npartitions
|
||||
from .index_metadata import _IndexMetadata
|
||||
from .iterator import PartitionIterator
|
||||
@@ -169,7 +169,7 @@ class DataFrame(object):
|
||||
index=columns, axis=1)
|
||||
|
||||
if self._dtypes_cache is None:
|
||||
self._correct_dtypes()
|
||||
self._get_remote_dtypes()
|
||||
|
||||
def _get_frame_data(self):
|
||||
data = {}
|
||||
@@ -455,23 +455,11 @@ class DataFrame(object):
|
||||
result.index = self.columns
|
||||
return result
|
||||
|
||||
def _correct_dtypes(self):
|
||||
"""Corrects dtypes by concatenating column blocks and then splitting them
|
||||
apart back into the original blocks.
|
||||
|
||||
Also caches ObjectIDs for the dtypes of every column.
|
||||
|
||||
Args:
|
||||
block_partitions: arglist of column blocks.
|
||||
def _get_remote_dtypes(self):
|
||||
"""Finds and caches ObjectIDs for the dtypes of each column partition.
|
||||
"""
|
||||
if self._block_partitions.shape[0] > 1:
|
||||
self._block_partitions = np.array(
|
||||
[_correct_column_dtypes._submit(
|
||||
args=column, num_return_vals=len(column))
|
||||
for column in self._block_partitions.T]).T
|
||||
|
||||
self._dtypes_cache = [_deploy_func.remote(lambda df: df.dtypes, pd_df)
|
||||
for pd_df in self._block_partitions[0]]
|
||||
self._dtypes_cache = [_compile_remote_dtypes.remote(*column)
|
||||
for column in self._block_partitions.T]
|
||||
|
||||
@property
|
||||
def dtypes(self):
|
||||
@@ -584,7 +572,7 @@ class DataFrame(object):
|
||||
self._block_partitions[:, 0], index=index, axis=0)
|
||||
|
||||
# Update dtypes
|
||||
self._correct_dtypes()
|
||||
self._get_remote_dtypes()
|
||||
|
||||
def add_prefix(self, prefix):
|
||||
"""Add a prefix to each of the column names.
|
||||
|
||||
@@ -456,20 +456,15 @@ def _concat_index(*index_parts):
|
||||
return index_parts[0].append(index_parts[1:])
|
||||
|
||||
|
||||
@ray.remote
|
||||
def _correct_column_dtypes(*column):
|
||||
"""Corrects dtypes of a column by concatenating column partitions and
|
||||
splitting the column back into partitions.
|
||||
|
||||
Args:
|
||||
"""
|
||||
concat_column = pd.concat(column, copy=False)
|
||||
return create_blocks_helper(concat_column, len(column), 1)
|
||||
|
||||
|
||||
def fix_blocks_dimensions(blocks, axis):
|
||||
"""Checks that blocks is 2D, and adds a dimension if not.
|
||||
"""
|
||||
if blocks.ndim < 2:
|
||||
return np.expand_dims(blocks, axis=axis ^ 1)
|
||||
return blocks
|
||||
|
||||
|
||||
@ray.remote
|
||||
def _compile_remote_dtypes(*column_of_blocks):
|
||||
small_dfs = [df.loc[0:0] for df in column_of_blocks]
|
||||
return pd.concat(small_dfs, copy=False).dtypes
|
||||
|
||||
Reference in New Issue
Block a user