diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index bbd219542..c0e09d8ef 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -21,6 +21,9 @@ import warnings import numpy as np import ray import itertools +import io +import sys +import re from .utils import ( _deploy_func, @@ -85,6 +88,7 @@ class DataFrame(object): axis = 0 columns = pd_df.columns index = pd_df.index + self._row_metadata = self._col_metadata = None else: # created this invariant to make sure we never have to go into the # partitions to get the columns @@ -158,13 +162,16 @@ class DataFrame(object): def __str__(self): return repr(self) - def __repr__(self): - if len(self._row_metadata) < 60: - result = repr(to_pandas(self)) - return result + def _repr_helper_(self): + if len(self._row_metadata) <= 60 and \ + len(self._col_metadata) <= 20: + return to_pandas(self) - def head(df, n): + def head(df, n, get_local_head=False): """Compute the head for this without creating a new DataFrame""" + if get_local_head: + return df.head(n) + new_dfs = _map_partitions(lambda df: df.head(n), df) @@ -174,8 +181,10 @@ class DataFrame(object): pd_head.columns = self.columns return pd_head - def tail(df, n): + def tail(df, n, get_local_tail=False): """Compute the tail for this without creating a new DataFrame""" + if get_local_tail: + return df.tail(n) new_dfs = _map_partitions(lambda df: df.tail(n), df) @@ -186,25 +195,91 @@ class DataFrame(object): pd_tail.columns = self.columns return pd_tail + def front(df, n): + """Get first n columns without creating a new Dataframe""" + + cum_col_lengths = self._col_metadata._lengths.cumsum() + index = np.argmax(cum_col_lengths >= 10) + pd_front = pd.concat(ray.get(x[:index+1]), axis=1, copy=False) + pd_front = pd_front.iloc[:, :n] + pd_front.index = self.index + pd_front.columns = self.columns[:n] + return pd_front + + def back(df, n): + """Get last n columns without creating a new Dataframe""" + + cum_col_lengths = np.flip(self._col_metadata._lengths, + axis=0).cumsum() + index = np.argmax(cum_col_lengths >= 10) + pd_back = pd.concat(ray.get(x[-(index+1):]), axis=1, copy=False) + pd_back = pd_back.iloc[:, -n:] + pd_back.index = self.index + pd_back.columns = self.columns[-n:] + return pd_back + x = self._col_partitions - head = head(x, 30) - tail = tail(x, 30) + get_local_head = False + + # Get first and last 10 columns if there are more than 20 columns + if len(self._col_metadata) >= 20: + get_local_head = True + front = front(x, 10) + back = back(x, 10) + + col_dots = pd.Series(["..." + for _ in range(len(self.index))]) + col_dots.index = self.index + col_dots.name = "..." + x = pd.concat([front, col_dots, back], axis=1) + + # If less than 60 rows, x is already in the correct format. + if len(self._row_metadata) < 60: + return x + + head = head(x, 30, get_local_head) + tail = tail(x, 30, get_local_head) # Make the dots in between the head and tail - dots = pd.Series(["..." - for _ in range(self._block_partitions.shape[1])]) - dots.index = head.columns - dots.name = "..." + row_dots = pd.Series(["..." + for _ in range(len(head.columns))]) + row_dots.index = head.columns + row_dots.name = "..." # We have to do it this way or convert dots to a dataframe and # transpose. This seems better. - result = head.append(dots).append(tail) + result = head.append(row_dots).append(tail) + return result + def __repr__(self): # We use pandas repr so that we match them. + if len(self._row_metadata) <= 60 and \ + len(self._col_metadata) <= 20: + return repr(self._repr_helper_()) # The split here is so that we don't repr pandas row lengths. - return repr(result).split("\n\n")[0] + \ - "\n\n[{0} rows X {1} columns]".format(len(self.index), + result = self._repr_helper_() + final_result = repr(result).rsplit("\n\n", maxsplit=1)[0] + \ + "\n\n[{0} rows x {1} columns]".format(len(self.index), len(self.columns)) + return final_result + + def _repr_html_(self): + """repr function for rendering in Jupyter Notebooks like Pandas + Dataframes. + + Returns: + The HTML representation of a Dataframe. + """ + # We use pandas _repr_html_ to get a string of the HTML representation + # of the dataframe. + if len(self._row_metadata) <= 60 and \ + len(self._col_metadata) <= 20: + return self._repr_helper_()._repr_html_() + # We split so that we insert our correct dataframe dimensions. + result = self._repr_helper_()._repr_html_() + return result.split('
')[0] + \ + '
{0} rows × {1} columns
\n'.format(len(self.index), + len(self.columns)) def _get_index(self): """Get the index for this DataFrame. @@ -258,9 +333,12 @@ class DataFrame(object): # We use the index to get the internal index. oid_series = [(oid_series[i], i) for i in range(len(oid_series))] - for df, partition in oid_series: - this_partition = self._col_metadata.partition_series(partition) - df.index = this_partition[this_partition.isin(df.index)].index + if len(oid_series) > 1: + for df, partition in oid_series: + this_partition = \ + self._col_metadata.partition_series(partition) + df.index = \ + this_partition[this_partition.isin(df.index)].index result_series = pd.concat([obj[0] for obj in oid_series], axis=0, copy=False) @@ -1514,9 +1592,74 @@ class DataFrame(object): def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + + def info_helper(df): + output_buffer = io.StringIO() + df.info(verbose=verbose, + buf=output_buffer, + max_cols=max_cols, + memory_usage=memory_usage, + null_counts=null_counts) + return output_buffer.getvalue() + + # Combine the per-partition info and split into lines + result = ''.join(ray.get(_map_partitions(info_helper, + self._col_partitions))) + lines = result.split('\n') + + # Class denoted in info() output + class_string = '