mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 19:01:10 +08:00
[DataFrame] Fixed repr, info, and memory_usage (#1874)
* working with dataframes with too many rows and columns * repr works for jupyter notebooks now * added comments and test file * added repr test file to .travis.yml * added back ray.dataframe as pd to test file * fixed pandas importing issues in test file * getting the front and back of df more efficiently * only keeping dataframe tests in travis * fixing numpy array for row and col lengths issue * doesn't add dimensions if df is small enough * implemented memory_usage() * completed memory_usage - still failing 2 tests * only failing one test for memory_usage * all repr and dataframes tests passing now * fixing error related to python2 in info() * fixing python2 errors * fixed linting errosr * using _arithmetic_helper in memory_usage() * fixed last lint error * removed testing-specific code * adding back travis test * removing extra tests from travis * re-added concat test * fixes with new indexing scheme * code cleanup * fully working with new indexing scheme * added tests for info and memory_usage * removed test file
This commit is contained in:
committed by
Devin Petersohn
parent
806b2c844e
commit
a3ddde398c
@@ -21,6 +21,9 @@ import warnings
|
||||
import numpy as np
|
||||
import ray
|
||||
import itertools
|
||||
import io
|
||||
import sys
|
||||
import re
|
||||
|
||||
from .utils import (
|
||||
_deploy_func,
|
||||
@@ -85,6 +88,7 @@ class DataFrame(object):
|
||||
axis = 0
|
||||
columns = pd_df.columns
|
||||
index = pd_df.index
|
||||
self._row_metadata = self._col_metadata = None
|
||||
else:
|
||||
# created this invariant to make sure we never have to go into the
|
||||
# partitions to get the columns
|
||||
@@ -158,13 +162,16 @@ class DataFrame(object):
|
||||
def __str__(self):
|
||||
return repr(self)
|
||||
|
||||
def __repr__(self):
|
||||
if len(self._row_metadata) < 60:
|
||||
result = repr(to_pandas(self))
|
||||
return result
|
||||
def _repr_helper_(self):
|
||||
if len(self._row_metadata) <= 60 and \
|
||||
len(self._col_metadata) <= 20:
|
||||
return to_pandas(self)
|
||||
|
||||
def head(df, n):
|
||||
def head(df, n, get_local_head=False):
|
||||
"""Compute the head for this without creating a new DataFrame"""
|
||||
if get_local_head:
|
||||
return df.head(n)
|
||||
|
||||
new_dfs = _map_partitions(lambda df: df.head(n),
|
||||
df)
|
||||
|
||||
@@ -174,8 +181,10 @@ class DataFrame(object):
|
||||
pd_head.columns = self.columns
|
||||
return pd_head
|
||||
|
||||
def tail(df, n):
|
||||
def tail(df, n, get_local_tail=False):
|
||||
"""Compute the tail for this without creating a new DataFrame"""
|
||||
if get_local_tail:
|
||||
return df.tail(n)
|
||||
|
||||
new_dfs = _map_partitions(lambda df: df.tail(n),
|
||||
df)
|
||||
@@ -186,25 +195,91 @@ class DataFrame(object):
|
||||
pd_tail.columns = self.columns
|
||||
return pd_tail
|
||||
|
||||
def front(df, n):
|
||||
"""Get first n columns without creating a new Dataframe"""
|
||||
|
||||
cum_col_lengths = self._col_metadata._lengths.cumsum()
|
||||
index = np.argmax(cum_col_lengths >= 10)
|
||||
pd_front = pd.concat(ray.get(x[:index+1]), axis=1, copy=False)
|
||||
pd_front = pd_front.iloc[:, :n]
|
||||
pd_front.index = self.index
|
||||
pd_front.columns = self.columns[:n]
|
||||
return pd_front
|
||||
|
||||
def back(df, n):
|
||||
"""Get last n columns without creating a new Dataframe"""
|
||||
|
||||
cum_col_lengths = np.flip(self._col_metadata._lengths,
|
||||
axis=0).cumsum()
|
||||
index = np.argmax(cum_col_lengths >= 10)
|
||||
pd_back = pd.concat(ray.get(x[-(index+1):]), axis=1, copy=False)
|
||||
pd_back = pd_back.iloc[:, -n:]
|
||||
pd_back.index = self.index
|
||||
pd_back.columns = self.columns[-n:]
|
||||
return pd_back
|
||||
|
||||
x = self._col_partitions
|
||||
head = head(x, 30)
|
||||
tail = tail(x, 30)
|
||||
get_local_head = False
|
||||
|
||||
# Get first and last 10 columns if there are more than 20 columns
|
||||
if len(self._col_metadata) >= 20:
|
||||
get_local_head = True
|
||||
front = front(x, 10)
|
||||
back = back(x, 10)
|
||||
|
||||
col_dots = pd.Series(["..."
|
||||
for _ in range(len(self.index))])
|
||||
col_dots.index = self.index
|
||||
col_dots.name = "..."
|
||||
x = pd.concat([front, col_dots, back], axis=1)
|
||||
|
||||
# If less than 60 rows, x is already in the correct format.
|
||||
if len(self._row_metadata) < 60:
|
||||
return x
|
||||
|
||||
head = head(x, 30, get_local_head)
|
||||
tail = tail(x, 30, get_local_head)
|
||||
|
||||
# Make the dots in between the head and tail
|
||||
dots = pd.Series(["..."
|
||||
for _ in range(self._block_partitions.shape[1])])
|
||||
dots.index = head.columns
|
||||
dots.name = "..."
|
||||
row_dots = pd.Series(["..."
|
||||
for _ in range(len(head.columns))])
|
||||
row_dots.index = head.columns
|
||||
row_dots.name = "..."
|
||||
|
||||
# We have to do it this way or convert dots to a dataframe and
|
||||
# transpose. This seems better.
|
||||
result = head.append(dots).append(tail)
|
||||
result = head.append(row_dots).append(tail)
|
||||
return result
|
||||
|
||||
def __repr__(self):
|
||||
# We use pandas repr so that we match them.
|
||||
if len(self._row_metadata) <= 60 and \
|
||||
len(self._col_metadata) <= 20:
|
||||
return repr(self._repr_helper_())
|
||||
# The split here is so that we don't repr pandas row lengths.
|
||||
return repr(result).split("\n\n")[0] + \
|
||||
"\n\n[{0} rows X {1} columns]".format(len(self.index),
|
||||
result = self._repr_helper_()
|
||||
final_result = repr(result).rsplit("\n\n", maxsplit=1)[0] + \
|
||||
"\n\n[{0} rows x {1} columns]".format(len(self.index),
|
||||
len(self.columns))
|
||||
return final_result
|
||||
|
||||
def _repr_html_(self):
|
||||
"""repr function for rendering in Jupyter Notebooks like Pandas
|
||||
Dataframes.
|
||||
|
||||
Returns:
|
||||
The HTML representation of a Dataframe.
|
||||
"""
|
||||
# We use pandas _repr_html_ to get a string of the HTML representation
|
||||
# of the dataframe.
|
||||
if len(self._row_metadata) <= 60 and \
|
||||
len(self._col_metadata) <= 20:
|
||||
return self._repr_helper_()._repr_html_()
|
||||
# We split so that we insert our correct dataframe dimensions.
|
||||
result = self._repr_helper_()._repr_html_()
|
||||
return result.split('<p>')[0] + \
|
||||
'<p>{0} rows × {1} columns</p>\n</div>'.format(len(self.index),
|
||||
len(self.columns))
|
||||
|
||||
def _get_index(self):
|
||||
"""Get the index for this DataFrame.
|
||||
@@ -258,9 +333,12 @@ class DataFrame(object):
|
||||
# We use the index to get the internal index.
|
||||
oid_series = [(oid_series[i], i) for i in range(len(oid_series))]
|
||||
|
||||
for df, partition in oid_series:
|
||||
this_partition = self._col_metadata.partition_series(partition)
|
||||
df.index = this_partition[this_partition.isin(df.index)].index
|
||||
if len(oid_series) > 1:
|
||||
for df, partition in oid_series:
|
||||
this_partition = \
|
||||
self._col_metadata.partition_series(partition)
|
||||
df.index = \
|
||||
this_partition[this_partition.isin(df.index)].index
|
||||
|
||||
result_series = pd.concat([obj[0] for obj in oid_series],
|
||||
axis=0, copy=False)
|
||||
@@ -1514,9 +1592,74 @@ class DataFrame(object):
|
||||
|
||||
def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
|
||||
null_counts=None):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
def info_helper(df):
|
||||
output_buffer = io.StringIO()
|
||||
df.info(verbose=verbose,
|
||||
buf=output_buffer,
|
||||
max_cols=max_cols,
|
||||
memory_usage=memory_usage,
|
||||
null_counts=null_counts)
|
||||
return output_buffer.getvalue()
|
||||
|
||||
# Combine the per-partition info and split into lines
|
||||
result = ''.join(ray.get(_map_partitions(info_helper,
|
||||
self._col_partitions)))
|
||||
lines = result.split('\n')
|
||||
|
||||
# Class denoted in info() output
|
||||
class_string = '<class \'ray.dataframe.dataframe.DataFrame\'>\n'
|
||||
|
||||
# Create the Index info() string by parsing self.index
|
||||
index_string = self.index.summary() + '\n'
|
||||
|
||||
# A column header is needed in the inf() output
|
||||
col_header = 'Data columns (total {0} columns):\n'.format(
|
||||
len(self.columns))
|
||||
|
||||
# Parse the per-partition values to get the per-column details
|
||||
# Find all the lines in the output that start with integers
|
||||
prog = re.compile('^[0-9]+.+')
|
||||
col_lines = [prog.match(line) for line in lines]
|
||||
cols = [c.group(0) for c in col_lines if c is not None]
|
||||
# replace the partition columns names with real column names
|
||||
columns = ["{0}\t{1}\n".format(self.columns[i],
|
||||
cols[i].split(" ", 1)[1])
|
||||
for i in range(len(cols))]
|
||||
col_string = ''.join(columns) + '\n'
|
||||
|
||||
# A summary of the dtypes in the dataframe
|
||||
dtypes_string = "dtypes: "
|
||||
for dtype, count in self.dtypes.value_counts().iteritems():
|
||||
dtypes_string += "{0}({1}),".format(dtype, count)
|
||||
dtypes_string = dtypes_string[:-1] + '\n'
|
||||
|
||||
# Compute the memory usage by summing per-partitions return values
|
||||
# Parse lines for memory usage number
|
||||
prog = re.compile('^memory+.+')
|
||||
mems = [prog.match(line) for line in lines]
|
||||
mem_vals = [float(re.search(r'\d+', m.group(0)).group())
|
||||
for m in mems if m is not None]
|
||||
|
||||
memory_string = ""
|
||||
|
||||
if len(mem_vals) != 0:
|
||||
# Sum memory usage from each partition
|
||||
if memory_usage != 'deep':
|
||||
memory_string = 'memory usage: {0}+ bytes'.format(
|
||||
sum(mem_vals))
|
||||
else:
|
||||
memory_string = 'memory usage: {0} bytes'.format(sum(mem_vals))
|
||||
|
||||
# Combine all the components of the info() output
|
||||
result = ''.join([class_string, index_string, col_header,
|
||||
col_string, dtypes_string, memory_string])
|
||||
|
||||
# Write to specified output buffer
|
||||
if buf:
|
||||
buf.write(result)
|
||||
else:
|
||||
sys.stdout.write(result)
|
||||
|
||||
def insert(self, loc, column, value, allow_duplicates=False):
|
||||
"""Insert column into DataFrame at specified location.
|
||||
@@ -1552,9 +1695,6 @@ class DataFrame(object):
|
||||
df.insert(index_within_partition, column, value, allow_duplicates)
|
||||
return df
|
||||
|
||||
print('partition:', partition)
|
||||
print('i_w_partition', index_within_partition)
|
||||
print('df:\n', ray.get(self._col_partitions[partition]))
|
||||
new_obj = _deploy_func.remote(insert_col_part,
|
||||
self._col_partitions[partition])
|
||||
new_cols = [self._col_partitions[i]
|
||||
@@ -1781,9 +1921,18 @@ class DataFrame(object):
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
def memory_usage(self, index=True, deep=False):
|
||||
raise NotImplementedError(
|
||||
"To contribute to Pandas on Ray, please visit "
|
||||
"github.com/ray-project/ray.")
|
||||
|
||||
def remote_func(df):
|
||||
return df.memory_usage(index=False, deep=deep)
|
||||
|
||||
result = self._arithmetic_helper(remote_func, axis=0)
|
||||
|
||||
result.index = self.columns
|
||||
if index:
|
||||
index_value = self._row_metadata.index.memory_usage(deep=deep)
|
||||
return pd.Series(index_value, index=['Index']).append(result)
|
||||
|
||||
return result
|
||||
|
||||
def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
|
||||
left_index=False, right_index=False, sort=False,
|
||||
|
||||
@@ -1648,11 +1648,12 @@ def test_infer_objects():
|
||||
ray_df.infer_objects()
|
||||
|
||||
|
||||
def test_info():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
ray_df.info()
|
||||
@pytest.fixture
|
||||
def test_info(ray_df):
|
||||
info_string = ray_df.info()
|
||||
assert '<class \'ray.dataframe.dataframe.DataFrame\'>\n' in info_string
|
||||
info_string = ray_df.info(memory_usage=True)
|
||||
assert 'memory_usage: ' in info_string
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -1815,11 +1816,12 @@ def test_melt():
|
||||
ray_df.melt()
|
||||
|
||||
|
||||
def test_memory_usage():
|
||||
ray_df = create_test_dataframe()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
ray_df.memory_usage()
|
||||
@pytest.fixture
|
||||
def test_memory_usage(ray_df):
|
||||
assert type(ray_df.memory_usage()) is pd.core.series.Series
|
||||
assert ray_df.memory_usage(index=True).at['Index'] is not None
|
||||
assert ray_df.memory_usage(deep=True).sum() >= \
|
||||
ray_df.memory_usage(deep=False).sum()
|
||||
|
||||
|
||||
def test_merge():
|
||||
|
||||
@@ -72,8 +72,9 @@ def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
|
||||
row_partitions.append(top)
|
||||
temp_df = temp_df[row_chunksize:]
|
||||
else:
|
||||
temp_df.reset_index(drop=True, inplace=True)
|
||||
temp_df.columns = pd.RangeIndex(0, len(temp_df.columns))
|
||||
if len(df) > row_chunksize:
|
||||
temp_df.reset_index(drop=True, inplace=True)
|
||||
temp_df.columns = pd.RangeIndex(0, len(temp_df.columns))
|
||||
row_partitions.append(ray.put(temp_df))
|
||||
|
||||
return row_partitions
|
||||
@@ -160,8 +161,9 @@ def _map_partitions(func, partitions, *argslists):
|
||||
def _build_columns(df_col, columns):
|
||||
"""Build columns and compute lengths for each partition."""
|
||||
# Columns and width
|
||||
widths = ray.get([_deploy_func.remote(lambda df: len(df.columns), d)
|
||||
for d in df_col])
|
||||
widths = np.array(ray.get([_deploy_func.remote(lambda df: len(df.columns),
|
||||
d)
|
||||
for d in df_col]))
|
||||
dest_indices = [(p_idx, p_sub_idx) for p_idx in range(len(widths))
|
||||
for p_sub_idx in range(widths[p_idx])]
|
||||
|
||||
@@ -175,8 +177,8 @@ def _build_columns(df_col, columns):
|
||||
def _build_index(df_row, index):
|
||||
"""Build index and compute lengths for each partition."""
|
||||
# Rows and length
|
||||
lengths = ray.get([_deploy_func.remote(_get_lengths, d)
|
||||
for d in df_row])
|
||||
lengths = np.array(ray.get([_deploy_func.remote(_get_lengths, d)
|
||||
for d in df_row]))
|
||||
|
||||
dest_indices = [(p_idx, p_sub_idx) for p_idx in range(len(lengths))
|
||||
for p_sub_idx in range(lengths[p_idx])]
|
||||
|
||||
Reference in New Issue
Block a user