From 4185aaed10ce4e416bdfe519b64e24c5eae21163 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 6 Jul 2018 00:16:22 -0700 Subject: [PATCH] Dataframe deprecation (#2353) --- .travis.yml | 12 - .travis/upgrade-syn.sh | 1 - .travis/yapf.sh | 1 - doc/source/pandas_on_ray.rst | 70 +- python/ray/dataframe/__init__.py | 62 +- python/ray/dataframe/concat.py | 133 - python/ray/dataframe/dataframe.py | 5413 ------------------- python/ray/dataframe/datetimes.py | 64 - python/ray/dataframe/groupby.py | 581 -- python/ray/dataframe/index_metadata.py | 415 -- python/ray/dataframe/indexing.py | 469 -- python/ray/dataframe/io.py | 524 -- python/ray/dataframe/iterator.py | 29 - python/ray/dataframe/pandas_code_gen.py | 94 - python/ray/dataframe/reshape.py | 125 - python/ray/dataframe/series.py | 970 ---- python/ray/dataframe/test/test_concat.py | 127 - python/ray/dataframe/test/test_dataframe.py | 3353 ------------ python/ray/dataframe/test/test_groupby.py | 589 -- python/ray/dataframe/test/test_io.py | 675 --- python/ray/dataframe/test/test_series.py | 1992 ------- python/ray/dataframe/utils.py | 577 -- 22 files changed, 6 insertions(+), 16270 deletions(-) delete mode 100644 python/ray/dataframe/concat.py delete mode 100644 python/ray/dataframe/dataframe.py delete mode 100644 python/ray/dataframe/datetimes.py delete mode 100644 python/ray/dataframe/groupby.py delete mode 100644 python/ray/dataframe/index_metadata.py delete mode 100644 python/ray/dataframe/indexing.py delete mode 100644 python/ray/dataframe/io.py delete mode 100644 python/ray/dataframe/iterator.py delete mode 100644 python/ray/dataframe/pandas_code_gen.py delete mode 100644 python/ray/dataframe/reshape.py delete mode 100644 python/ray/dataframe/series.py delete mode 100644 python/ray/dataframe/test/test_concat.py delete mode 100644 python/ray/dataframe/test/test_dataframe.py delete mode 100644 python/ray/dataframe/test/test_groupby.py delete mode 100644 python/ray/dataframe/test/test_io.py delete mode 100644 python/ray/dataframe/test/test_series.py delete mode 100644 python/ray/dataframe/utils.py diff --git a/.travis.yml b/.travis.yml index 466c5a6f8..11479bcf1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -143,12 +143,6 @@ matrix: - python test/cython_test.py - python test/credis_test.py - # ray dataframe tests - # - python -m pytest python/ray/dataframe/test/test_dataframe.py - - python -m pytest python/ray/dataframe/test/test_concat.py - - python -m pytest python/ray/dataframe/test/test_io.py - # - python -m pytest python/ray/dataframe/test/test_groupby.py - # ray tune tests - python python/ray/tune/test/dependency_test.py - python -m pytest python/ray/tune/test/trial_runner_test.py @@ -208,12 +202,6 @@ script: - python test/cython_test.py - python test/credis_test.py - # ray dataframe tests - - python -m pytest python/ray/dataframe/test/test_dataframe.py - - python -m pytest python/ray/dataframe/test/test_concat.py - - python -m pytest python/ray/dataframe/test/test_io.py - - python -m pytest python/ray/dataframe/test/test_groupby.py - # ray tune tests - python python/ray/tune/test/dependency_test.py - python -m pytest python/ray/tune/test/trial_runner_test.py diff --git a/.travis/upgrade-syn.sh b/.travis/upgrade-syn.sh index 8bfaacd39..a7f7cd8c6 100755 --- a/.travis/upgrade-syn.sh +++ b/.travis/upgrade-syn.sh @@ -13,7 +13,6 @@ find \ python test \ -name '*.py' -type f \ -not -path 'python/ray/cloudpickle/*' \ - -not -path 'python/ray/dataframe/*' \ -exec python -m pyupgrade {} + if ! git diff --quiet; then diff --git a/.travis/yapf.sh b/.travis/yapf.sh index 45cabf587..7c12ce4b4 100755 --- a/.travis/yapf.sh +++ b/.travis/yapf.sh @@ -24,7 +24,6 @@ YAPF_FLAGS=( ) YAPF_EXCLUDES=( - '--exclude' 'python/ray/dataframe/*' '--exclude' 'python/ray/rllib/*' '--exclude' 'python/ray/cloudpickle/*' '--exclude' 'python/build/*' diff --git a/doc/source/pandas_on_ray.rst b/doc/source/pandas_on_ray.rst index 59d2fc699..4378d12e4 100644 --- a/doc/source/pandas_on_ray.rst +++ b/doc/source/pandas_on_ray.rst @@ -1,71 +1,9 @@ Pandas on Ray ============= -Pandas on Ray is an early stage DataFrame library that wraps Pandas and -transparently distributes the data and computation. The user does not need to -know how many cores their system has, nor do they need to specify how to -distribute the data. In fact, users can continue using their previous Pandas -notebooks while experiencing a considerable speedup from Pandas on Ray, even -on a single machine. Only a modification of the import statement is needed, as -we demonstrate below. Once you’ve changed your import statement, you’re ready -to use Pandas on Ray just like you would Pandas. +**Pandas on Ray has moved to Modin!** -.. code-block:: python +Pandas on Ray has moved into the `Modin project`_ with the intention of +unifying the DataFrame APIs. - # import pandas as pd - import ray.dataframe as pd - -Currently, we have part of the Pandas API implemented and are working toward -full functional parity with Pandas. - -Using Pandas on Ray on a Single Node ------------------------------------- - -In order to use the most up-to-date version of Pandas on Ray, please follow -the instructions on the `installation page`_ - -Once you import the library, you should see something similar to the following -output: - -.. code-block:: text - - >>> import ray.dataframe as pd - - Waiting for redis server at 127.0.0.1:14618 to respond... - Waiting for redis server at 127.0.0.1:31410 to respond... - Starting local scheduler with the following resources: {'CPU': 4, 'GPU': 0}. - - ====================================================================== - View the web UI at http://localhost:8889/notebooks/ray_ui36796.ipynb?token=ac25867d62c4ae87941bc5a0ecd5f517dbf80bd8e9b04218 - ====================================================================== - -If you do not see output similar to the above, please make sure that you have -built Ray using the instructions on the `installation page`_ - -One you have executed ``import ray.dataframe as pd``, you're ready to begin -running your Pandas pipeline as you were before. Please note, the API is not -yet complete. For some methods, you may see the following: - -.. code-block:: text - - NotImplementedError: To contribute to Pandas on Ray, please visit github.com/ray-project/ray. - -If you would like to request a particular method be implemented, feel free to -`open an issue`_. Before you open an issue please make sure that someone else -has not already requested that functionality. - -Using Pandas on Ray on a Cluster --------------------------------- - -Currently, we do not yet support running Pandas on Ray on a cluster. Coming -Soon! - -Examples --------- -You can find an example on our recent `blog post`_ or on the -`Jupyter Notebook`_ that we used to create the blog post. - -.. _`installation page`: http://ray.readthedocs.io/en/latest/installation.html -.. _`open an issue`: http://github.com/ray-project/ray/issues -.. _`blog post`: http://rise.cs.berkeley.edu/blog/pandas-on-ray -.. _`Jupyter Notebook`: http://gist.github.com/devin-petersohn/f424d9fb5579a96507c709a36d487f24#file-pandas_on_ray_blog_post_0-ipynb +.. _`Modin project`: https://github.com/modin-project/modin diff --git a/python/ray/dataframe/__init__.py b/python/ray/dataframe/__init__.py index d71ac1e3d..a22f7ff61 100644 --- a/python/ray/dataframe/__init__.py +++ b/python/ray/dataframe/__init__.py @@ -1,60 +1,2 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pandas -# TODO: In the future `set_option` or similar needs to run on every node -# in order to keep all pandas instances across nodes consistent -from pandas import (eval, unique, value_counts, cut, to_numeric, factorize, - test, qcut, match, Panel, date_range, Index, MultiIndex, - CategoricalIndex, Series, bdate_range, DatetimeIndex, - Timedelta, Timestamp, to_timedelta, set_eng_float_format, - set_option, NaT, PeriodIndex, Categorical) -import threading - -pandas_version = pandas.__version__ -pandas_major = int(pandas_version.split(".")[0]) -pandas_minor = int(pandas_version.split(".")[1]) - -if pandas_major == 0 and pandas_minor != 22: - raise Exception("In order to use Pandas on Ray, your pandas version must " - "be 0.22. You can run 'pip install pandas==0.22'") - -DEFAULT_NPARTITIONS = 8 - - -def set_npartition_default(n): - global DEFAULT_NPARTITIONS - DEFAULT_NPARTITIONS = n - - -def get_npartitions(): - return DEFAULT_NPARTITIONS - - -# We import these file after above two function -# because they depend on npartitions. -from .dataframe import DataFrame # noqa: 402 -from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402 - read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402 - read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402 - read_sql) # noqa: 402 -from .concat import concat # noqa: 402 -from .datetimes import to_datetime # noqa: 402 -from .reshape import get_dummies # noqa: 402 - -__all__ = [ - "DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval", - "unique", "value_counts", "cut", "to_numeric", "factorize", "test", "qcut", - "match", "to_datetime", "get_dummies", "Panel", "date_range", "Index", - "MultiIndex", "Series", "bdate_range", "DatetimeIndex", "to_timedelta", - "set_eng_float_format", "set_option", "CategoricalIndex", "Timedelta", - "Timestamp", "NaT", "PeriodIndex", "Categorical" -] - -try: - if threading.current_thread().name == "MainThread": - import ray - ray.init() -except AssertionError: - pass +raise DeprecationWarning("Pandas on Ray has moved to Modin: " + "github.com/modin-project/modin") diff --git a/python/ray/dataframe/concat.py b/python/ray/dataframe/concat.py deleted file mode 100644 index ef3686284..000000000 --- a/python/ray/dataframe/concat.py +++ /dev/null @@ -1,133 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pandas -import numpy as np -from .dataframe import DataFrame -from .utils import _reindex_helper - - -def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, - copy=True): - - if keys is not None: - objs = [objs[k] for k in keys] - else: - objs = list(objs) - - if len(objs) == 0: - raise ValueError("No objects to concatenate") - - objs = [obj for obj in objs if obj is not None] - - if len(objs) == 0: - raise ValueError("All objects passed were None") - - try: - type_check = next(obj for obj in objs - if not isinstance(obj, (pandas.Series, - pandas.DataFrame, - DataFrame))) - except StopIteration: - type_check = None - if type_check is not None: - raise ValueError("cannot concatenate object of type \"{0}\"; only " - "pandas.Series, pandas.DataFrame, " - "and ray.dataframe.DataFrame objs are " - "valid", type(type_check)) - - all_series = all(isinstance(obj, pandas.Series) - for obj in objs) - if all_series: - return DataFrame(pandas.concat(objs, axis, join, join_axes, - ignore_index, keys, levels, names, - verify_integrity, copy)) - - if isinstance(objs, dict): - raise NotImplementedError( - "Obj as dicts not implemented. To contribute to " - "Pandas on Ray, please visit github.com/ray-project/ray.") - - axis = pandas.DataFrame()._get_axis_number(axis) - - if join not in ['inner', 'outer']: - raise ValueError("Only can inner (intersect) or outer (union) join the" - " other axis") - - # We need this in a list because we use it later. - all_index, all_columns = list(zip(*[(obj.index, obj.columns) - for obj in objs])) - - def series_to_df(series, columns): - df = pandas.DataFrame(series) - df.columns = columns - return DataFrame(df) - - # Pandas puts all of the Series in a single column named 0. This is - # true regardless of the existence of another column named 0 in the - # concat. - if axis == 0: - objs = [series_to_df(obj, [0]) - if isinstance(obj, pandas.Series) else obj for obj in objs] - else: - # Pandas starts the count at 0 so this will increment the names as - # long as there's a new nameless Series being added. - def name_incrementer(i): - val = i[0] - i[0] += 1 - return val - - i = [0] - objs = [series_to_df(obj, obj.name if obj.name is not None - else name_incrementer(i)) - if isinstance(obj, pandas.Series) else obj for obj in objs] - - # Using concat on the columns and index is fast because they're empty, - # and it forces the error checking. It also puts the columns in the - # correct order for us. - final_index = \ - pandas.concat([pandas.DataFrame(index=idx) for idx in all_index], - axis=axis, join=join, join_axes=join_axes, - ignore_index=ignore_index, keys=keys, levels=levels, - names=names, verify_integrity=verify_integrity, - copy=False).index - final_columns = \ - pandas.concat([pandas.DataFrame(columns=col) - for col in all_columns], - axis=axis, join=join, join_axes=join_axes, - ignore_index=ignore_index, keys=keys, levels=levels, - names=names, verify_integrity=verify_integrity, - copy=False).columns - - # Put all of the DataFrames into Ray format - # TODO just partition the DataFrames instead of building a new Ray DF. - objs = [DataFrame(obj) if isinstance(obj, (pandas.DataFrame, - pandas.Series)) else obj - for obj in objs] - - # Here we reuse all_columns/index so we don't have to materialize objects - # from remote memory built in the previous line. In the future, we won't be - # building new DataFrames, rather just partitioning the DataFrames. - if axis == 0: - new_blocks = np.array([_reindex_helper._submit( - args=tuple([all_columns[i], final_columns, axis, - len(objs[0]._block_partitions)] + part.tolist()), - num_return_vals=len(objs[0]._block_partitions)) - for i in range(len(objs)) - for part in objs[i]._block_partitions]) - else: - # Transposing the columns is necessary because the remote task treats - # everything like rows and returns in row-major format. Luckily, this - # operation is cheap in numpy. - new_blocks = np.array([_reindex_helper._submit( - args=tuple([all_index[i], final_index, axis, - len(objs[0]._block_partitions.T)] + part.tolist()), - num_return_vals=len(objs[0]._block_partitions.T)) - for i in range(len(objs)) - for part in objs[i]._block_partitions.T]).T - - return DataFrame(block_partitions=new_blocks, - columns=final_columns, - index=final_index) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py deleted file mode 100644 index 590f4a993..000000000 --- a/python/ray/dataframe/dataframe.py +++ /dev/null @@ -1,5413 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pandas -import functools -from pandas.api.types import is_scalar -from pandas.util._validators import validate_bool_kwarg -from pandas.core.index import _ensure_index_from_sequences -from pandas._libs import lib -from pandas.core.dtypes.cast import maybe_upcast_putmask -from pandas import compat -from pandas.compat import lzip, to_str, string_types, cPickle as pkl -import pandas.core.common as com -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_list_like, - is_numeric_dtype, - is_timedelta64_dtype, - _get_dtype_from_object) -from pandas.core.indexing import check_bool_indexer -from pandas.errors import MergeError - -import warnings -import numpy as np -from numpy.testing import assert_equal -import ray -import itertools -import io -import sys -import re - -from .utils import ( - _deploy_func, - _map_partitions, - _partition_pandas_dataframe, - to_pandas, - create_blocks_helper, - _blocks_to_col, - _blocks_to_row, - _create_block_partitions, - _inherit_docstrings, - _reindex_helper, - _co_op_helper, - _match_partitioning, - _concat_index, - fix_blocks_dimensions, - _compile_remote_dtypes) -from . import get_npartitions -from .index_metadata import _IndexMetadata -from .iterator import PartitionIterator - - -@_inherit_docstrings(pandas.DataFrame, - excluded=[pandas.DataFrame, pandas.DataFrame.__init__]) -class DataFrame(object): - - def __init__(self, data=None, index=None, columns=None, dtype=None, - copy=False, col_partitions=None, row_partitions=None, - block_partitions=None, row_metadata=None, col_metadata=None, - dtypes_cache=None): - """Distributed DataFrame object backed by Pandas dataframes. - - Args: - data (numpy ndarray (structured or homogeneous) or dict): - Dict can contain Series, arrays, constants, or list-like - objects. - index (pandas.Index, list, ObjectID): The row index for this - dataframe. - columns (pandas.Index): The column names for this dataframe, in - pandas Index object. - dtype: Data type to force. Only a single dtype is allowed. - If None, infer - copy (boolean): Copy data from inputs. - Only affects DataFrame / 2d ndarray input - col_partitions ([ObjectID]): The list of ObjectIDs that contain - the column dataframe partitions. - row_partitions ([ObjectID]): The list of ObjectIDs that contain the - row dataframe partitions. - block_partitions: A 2D numpy array of block partitions. - row_metadata (_IndexMetadata): - Metadata for the new dataframe's rows - col_metadata (_IndexMetadata): - Metadata for the new dataframe's columns - """ - if isinstance(data, DataFrame): - self._frame_data = data._frame_data - return - - self._dtypes_cache = dtypes_cache - - # Check type of data and use appropriate constructor - if data is not None or (col_partitions is None and - row_partitions is None and - block_partitions is None): - - pandas_df = pandas.DataFrame(data=data, index=index, - columns=columns, dtype=dtype, - copy=copy) - - # Cache dtypes - self._dtypes_cache = pandas_df.dtypes - - # TODO convert _partition_pandas_dataframe to block partitioning. - row_partitions = \ - _partition_pandas_dataframe(pandas_df, - num_partitions=get_npartitions()) - - self._block_partitions = \ - _create_block_partitions(row_partitions, axis=0, - length=len(pandas_df.columns)) - - # Set in case we were only given a single row/column for below. - axis = 0 - columns = pandas_df.columns - index = pandas_df.index - else: - # created this invariant to make sure we never have to go into the - # partitions to get the columns - assert columns is not None or col_metadata is not None, \ - "Columns not defined, must define columns or col_metadata " \ - "for internal DataFrame creations" - - if block_partitions is not None: - axis = 0 - # put in numpy array here to make accesses easier since it's 2D - self._block_partitions = np.array(block_partitions) - self._block_partitions = \ - fix_blocks_dimensions(self._block_partitions, axis) - - else: - if row_partitions is not None: - axis = 0 - partitions = row_partitions - axis_length = len(columns) if columns is not None else \ - len(col_metadata) - elif col_partitions is not None: - axis = 1 - partitions = col_partitions - axis_length = len(index) if index is not None else \ - len(row_metadata) - # All partitions will already have correct dtypes - self._dtypes_cache = [ - _deploy_func.remote(lambda df: df.dtypes, pandas_df) - for pandas_df in col_partitions - ] - - # TODO: write explicit tests for "short and wide" - # column partitions - self._block_partitions = \ - _create_block_partitions(partitions, axis=axis, - length=axis_length) - - assert self._block_partitions.ndim == 2, "Block Partitions must be 2D." - - # Create the row and column index objects for using our partitioning. - # If the objects haven't been inherited, then generate them - if row_metadata is not None: - self._row_metadata = row_metadata.copy() - if index is not None: - self.index = index - else: - self._row_metadata = _IndexMetadata(self._block_partitions[:, 0], - index=index, axis=0) - - if col_metadata is not None: - self._col_metadata = col_metadata.copy() - if columns is not None: - self.columns = columns - else: - self._col_metadata = _IndexMetadata(self._block_partitions[0, :], - index=columns, axis=1) - - if self._dtypes_cache is None: - self._get_remote_dtypes() - - def _get_frame_data(self): - data = {} - data['blocks'] = self._block_partitions - data['col_metadata'] = self._col_metadata - data['row_metadata'] = self._row_metadata - data['columns'] = self.columns - data['index'] = self.index - data['dtypes'] = self._dtypes_cache - - return data - - def _set_frame_data(self, data): - self._block_partitions = data['blocks'] - self._col_metadata = data['col_metadata'] - self._row_metadata = data['row_metadata'] - self.columns = data['columns'] - self.index = data['index'] - self._dtypes_cache = data['dtypes'] - - _frame_data = property(_get_frame_data, _set_frame_data) - - def _get_row_partitions(self): - empty_rows_mask = self._row_metadata._lengths > 0 - if any(empty_rows_mask): - self._row_metadata._lengths = \ - self._row_metadata._lengths[empty_rows_mask] - self._block_partitions = self._block_partitions[empty_rows_mask, :] - return [_blocks_to_row.remote(*part) - for i, part in enumerate(self._block_partitions)] - - def _set_row_partitions(self, new_row_partitions): - self._block_partitions = \ - _create_block_partitions(new_row_partitions, axis=0, - length=len(self.columns)) - - _row_partitions = property(_get_row_partitions, _set_row_partitions) - - def _get_col_partitions(self): - empty_cols_mask = self._col_metadata._lengths > 0 - if any(empty_cols_mask): - self._col_metadata._lengths = \ - self._col_metadata._lengths[empty_cols_mask] - self._block_partitions = self._block_partitions[:, empty_cols_mask] - return [_blocks_to_col.remote(*self._block_partitions[:, i]) - for i in range(self._block_partitions.shape[1])] - - def _set_col_partitions(self, new_col_partitions): - self._block_partitions = \ - _create_block_partitions(new_col_partitions, axis=1, - length=len(self.index)) - - _col_partitions = property(_get_col_partitions, _set_col_partitions) - - def __str__(self): - return repr(self) - - def _repr_helper_(self): - if len(self._row_metadata) <= 60 and \ - len(self._col_metadata) <= 20: - return to_pandas(self) - - def head(df, n, get_local_head=False): - """Compute the head for this without creating a new DataFrame""" - if get_local_head: - return df.head(n) - - new_dfs = _map_partitions(lambda df: df.head(n), - df) - - index = self.index[:n] - pandas_head = pandas.concat(ray.get(new_dfs), axis=1, copy=False) - pandas_head.index = index - pandas_head.columns = self.columns - return pandas_head - - def tail(df, n, get_local_tail=False): - """Compute the tail for this without creating a new DataFrame""" - if get_local_tail: - return df.tail(n) - - new_dfs = _map_partitions(lambda df: df.tail(n), - df) - - index = self.index[-n:] - pandas_tail = pandas.concat(ray.get(new_dfs), axis=1, copy=False) - pandas_tail.index = index - pandas_tail.columns = self.columns - return pandas_tail - - def front(df, n): - """Get first n columns without creating a new Dataframe""" - - cum_col_lengths = self._col_metadata._lengths.cumsum() - index = np.argmax(cum_col_lengths >= 10) - pandas_front = pandas.concat(ray.get(x[:index + 1]), - axis=1, copy=False) - pandas_front = pandas_front.iloc[:, :n] - pandas_front.index = self.index - pandas_front.columns = self.columns[:n] - return pandas_front - - def back(df, n): - """Get last n columns without creating a new Dataframe""" - - cum_col_lengths = np.flip(self._col_metadata._lengths, - axis=0).cumsum() - index = np.argmax(cum_col_lengths >= 10) - pandas_back = pandas.concat(ray.get(x[-(index + 1):]), - axis=1, copy=False) - pandas_back = pandas_back.iloc[:, -n:] - pandas_back.index = self.index - pandas_back.columns = self.columns[-n:] - return pandas_back - - x = self._col_partitions - get_local_head = False - - # Get first and last 10 columns if there are more than 20 columns - if len(self._col_metadata) >= 20: - get_local_head = True - front = front(x, 10) - back = back(x, 10) - - col_dots = pandas.Series(["..." for _ in range(len(self.index))]) - col_dots.index = self.index - col_dots.name = "..." - x = pandas.concat([front, col_dots, back], axis=1, copy=False) - - # If less than 60 rows, x is already in the correct format. - if len(self._row_metadata) < 60: - return x - - head = head(x, 30, get_local_head) - tail = tail(x, 30, get_local_head) - - # Make the dots in between the head and tail - row_dots = pandas.Series(["..." for _ in range(len(head.columns))]) - row_dots.index = head.columns - row_dots.name = "..." - - # We have to do it this way or convert dots to a dataframe and - # transpose. This seems better. - result = head.append(row_dots).append(tail) - return result - - def __repr__(self): - # We use pandas repr so that we match them. - if len(self._row_metadata) <= 60 and \ - len(self._col_metadata) <= 20: - return repr(self._repr_helper_()) - # The split here is so that we don't repr pandas row lengths. - result = self._repr_helper_() - final_result = repr(result).rsplit("\n\n", maxsplit=1)[0] + \ - "\n\n[{0} rows x {1} columns]".format(len(self.index), - len(self.columns)) - return final_result - - def _repr_html_(self): - """repr function for rendering in Jupyter Notebooks like Pandas - Dataframes. - - Returns: - The HTML representation of a Dataframe. - """ - # We use pandas _repr_html_ to get a string of the HTML representation - # of the dataframe. - if len(self._row_metadata) <= 60 and \ - len(self._col_metadata) <= 20: - return self._repr_helper_()._repr_html_() - # We split so that we insert our correct dataframe dimensions. - result = self._repr_helper_()._repr_html_() - return result.split("

")[0] + \ - "

{0} rows x {1} columns

\n".format(len(self.index), - len(self.columns)) - - def _get_index(self): - """Get the index for this DataFrame. - - Returns: - The union of all indexes across the partitions. - """ - return self._row_metadata.index - - def _set_index(self, new_index): - """Set the index for this DataFrame. - - Args: - new_index: The new index to set this - """ - self._row_metadata.index = new_index - - index = property(_get_index, _set_index) - - def _get_columns(self): - """Get the columns for this DataFrame. - - Returns: - The union of all indexes across the partitions. - """ - return self._col_metadata.index - - def _set_columns(self, new_index): - """Set the columns for this DataFrame. - - Args: - new_index: The new index to set this - """ - self._col_metadata.index = new_index - - columns = property(_get_columns, _set_columns) - - def _arithmetic_helper(self, remote_func, axis, level=None): - # TODO: We don't support `level` right now - if level is not None: - raise NotImplementedError("Level not yet supported.") - - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 - - oid_series = ray.get(_map_partitions(remote_func, - self._col_partitions if axis == 0 - else self._row_partitions)) - - if axis == 0: - # We use the index to get the internal index. - oid_series = [(oid_series[i], i) for i in range(len(oid_series))] - - if len(oid_series) > 0: - for df, partition in oid_series: - this_partition = \ - self._col_metadata.partition_series(partition) - df.index = \ - this_partition[this_partition.isin(df.index)].index - - result_series = pandas.concat([obj[0] for obj in oid_series], - axis=0, copy=False) - else: - result_series = pandas.concat(oid_series, axis=0, copy=False) - result_series.index = self.index - return result_series - - def _validate_eval_query(self, expr, **kwargs): - """Helper function to check the arguments to eval() and query() - - Args: - expr: The expression to evaluate. This string cannot contain any - Python statements, only Python expressions. - """ - if isinstance(expr, str) and expr is '': - raise ValueError("expr cannot be an empty string") - - if isinstance(expr, str) and '@' in expr: - raise NotImplementedError("Local variables not yet supported in " - "eval.") - - if isinstance(expr, str) and 'not' in expr: - if 'parser' in kwargs and kwargs['parser'] == 'python': - raise NotImplementedError("'Not' nodes are not implemented.") - - @property - def size(self): - """Get the number of elements in the DataFrame. - - Returns: - The number of elements in the DataFrame. - """ - return len(self.index) * len(self.columns) - - @property - def ndim(self): - """Get the number of dimensions for this DataFrame. - - Returns: - The number of dimensions for this DataFrame. - """ - # The number of dimensions is common across all partitions. - # The first partition will be enough. - return ray.get(_deploy_func.remote(lambda df: df.ndim, - self._row_partitions[0])) - - @property - def ftypes(self): - """Get the ftypes for this DataFrame. - - Returns: - The ftypes for this DataFrame. - """ - # The ftypes are common across all partitions. - # The first partition will be enough. - result = ray.get(_deploy_func.remote(lambda df: df.ftypes, - self._row_partitions[0])) - result.index = self.columns - return result - - def _get_remote_dtypes(self): - """Finds and caches ObjectIDs for the dtypes of each column partition. - """ - self._dtypes_cache = [_compile_remote_dtypes.remote(*column) - for column in self._block_partitions.T] - - @property - def dtypes(self): - """Get the dtypes for this DataFrame. - - Returns: - The dtypes for this DataFrame. - """ - assert self._dtypes_cache is not None - - if isinstance(self._dtypes_cache, list) and \ - isinstance(self._dtypes_cache[0], - ray.ObjectID): - self._dtypes_cache = pandas.concat(ray.get(self._dtypes_cache), - copy=False) - self._dtypes_cache.index = self.columns - - return self._dtypes_cache - - @property - def empty(self): - """Determines if the DataFrame is empty. - - Returns: - True if the DataFrame is empty. - False otherwise. - """ - return len(self.columns) == 0 or len(self.index) == 0 - - @property - def values(self): - """Create a numpy array with the values from this DataFrame. - - Returns: - The numpy representation of this DataFrame. - """ - return np.concatenate(ray.get(_map_partitions( - lambda df: df.values, self._row_partitions))) - - @property - def axes(self): - """Get the axes for the DataFrame. - - Returns: - The axes for the DataFrame. - """ - return [self.index, self.columns] - - @property - def shape(self): - """Get the size of each of the dimensions in the DataFrame. - - Returns: - A tuple with the size of each dimension as they appear in axes(). - """ - return len(self.index), len(self.columns) - - def _update_inplace(self, row_partitions=None, col_partitions=None, - block_partitions=None, columns=None, index=None, - col_metadata=None, row_metadata=None): - """updates the current DataFrame inplace. - - Behavior should be similar to the constructor, given the corresponding - arguments. Note that len(columns) and len(index) should match the - corresponding dimensions in the partition(s) passed in, otherwise this - function will complain. - - Args: - row_partitions ([ObjectID]): - The new partitions to replace self._row_partitions directly - col_partitions ([ObjectID]): - The new partitions to replace self._col_partitions directly - columns (pandas.Index): - Index of the column dimension to replace existing columns - index (pandas.Index): - Index of the row dimension to replace existing index - - Note: - If `columns` or `index` are not supplied, they will revert to - default columns or index respectively, as this function does - not have enough contextual info to rebuild the indexes - correctly based on the addition/subtraction of rows/columns. - """ - assert row_partitions is not None or col_partitions is not None\ - or block_partitions is not None, \ - "To update inplace, new column or row partitions must be set." - - if block_partitions is not None: - self._block_partitions = block_partitions - - elif row_partitions is not None: - self._row_partitions = row_partitions - - elif col_partitions is not None: - self._col_partitions = col_partitions - - if col_metadata is not None: - self._col_metadata = col_metadata - else: - assert columns is not None, \ - "If col_metadata is None, columns must be passed in" - self._col_metadata = _IndexMetadata( - self._block_partitions[0, :], index=columns, axis=1) - if row_metadata is not None: - self._row_metadata = row_metadata - else: - # Index can be None for default index, so we don't check - self._row_metadata = _IndexMetadata( - self._block_partitions[:, 0], index=index, axis=0) - - # Update dtypes - self._get_remote_dtypes() - - def add_prefix(self, prefix): - """Add a prefix to each of the column names. - - Returns: - A new DataFrame containing the new column names. - """ - new_cols = self.columns.map(lambda x: str(prefix) + str(x)) - return DataFrame(block_partitions=self._block_partitions, - columns=new_cols, - col_metadata=self._col_metadata, - row_metadata=self._row_metadata, - dtypes_cache=self._dtypes_cache) - - def add_suffix(self, suffix): - """Add a suffix to each of the column names. - - Returns: - A new DataFrame containing the new column names. - """ - new_cols = self.columns.map(lambda x: str(x) + str(suffix)) - return DataFrame(block_partitions=self._block_partitions, - columns=new_cols, - col_metadata=self._col_metadata, - row_metadata=self._row_metadata, - dtypes_cache=self._dtypes_cache) - - def applymap(self, func): - """Apply a function to a DataFrame elementwise. - - Args: - func (callable): The function to apply. - """ - if not callable(func): - raise ValueError( - "\'{0}\' object is not callable".format(type(func))) - - new_block_partitions = np.array([ - _map_partitions(lambda df: df.applymap(func), block) - for block in self._block_partitions]) - - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) - - def copy(self, deep=True): - """Creates a shallow copy of the DataFrame. - - Returns: - A new DataFrame pointing to the same partitions as this one. - """ - return DataFrame(block_partitions=self._block_partitions, - columns=self.columns, - index=self.index, - dtypes_cache=self._dtypes_cache) - - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): - """Apply a groupby to this DataFrame. See _groupby() remote task. - Args: - by: The value to groupby. - axis: The axis to groupby. - level: The level of the groupby. - as_index: Whether or not to store result as index. - sort: Whether or not to sort the result by the index. - group_keys: Whether or not to group the keys. - squeeze: Whether or not to squeeze. - Returns: - A new DataFrame resulting from the groupby. - """ - axis = pandas.DataFrame()._get_axis_number(axis) - if callable(by): - by = by(self.index) - elif isinstance(by, compat.string_types): - by = self.__getitem__(by).values.tolist() - elif is_list_like(by): - if isinstance(by, pandas.Series): - by = by.values.tolist() - - mismatch = len(by) != len(self) if axis == 0 \ - else len(by) != len(self.columns) - - if all(obj in self for obj in by) and mismatch: - raise NotImplementedError( - "Groupby with lists of columns not yet supported.") - elif mismatch: - raise KeyError(next(x for x in by if x not in self)) - - from .groupby import DataFrameGroupBy - return DataFrameGroupBy(self, by, axis, level, as_index, sort, - group_keys, squeeze, **kwargs) - - def sum(self, axis=None, skipna=True, level=None, numeric_only=None, - min_count=1, **kwargs): - """Perform a sum across the DataFrame. - - Args: - axis (int): The axis to sum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The sum of the DataFrame. - """ - def remote_func(df): - return df.sum(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, min_count=min_count, - **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def abs(self): - """Apply an absolute value function to all numeric columns. - - Returns: - A new DataFrame with the applied absolute value. - """ - for t in self.dtypes: - if np.dtype('O') == t: - # TODO Give a more accurate error to Pandas - raise TypeError("bad operand type for abs():", "str") - - new_block_partitions = np.array([_map_partitions(lambda df: df.abs(), - block) - for block in self._block_partitions]) - - return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index, - dtypes_cache=self._dtypes_cache) - - def isin(self, values): - """Fill a DataFrame with booleans for cells contained in values. - - Args: - values (iterable, DataFrame, Series, or dict): The values to find. - - Returns: - A new DataFrame with booleans representing whether or not a cell - is in values. - True: cell is contained in values. - False: otherwise - """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.isin(values), block) - for block in self._block_partitions]) - - return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) - - def isna(self): - """Fill a DataFrame with booleans for cells containing NA. - - Returns: - A new DataFrame with booleans representing whether or not a cell - is NA. - True: cell contains NA. - False: otherwise. - """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.isna(), block) for block in self._block_partitions]) - - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) - - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) - - def isnull(self): - """Fill a DataFrame with booleans for cells containing a null value. - - Returns: - A new DataFrame with booleans representing whether or not a cell - is null. - True: cell contains null. - False: otherwise. - """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.isnull(), block) - for block in self._block_partitions]) - - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) - - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) - - def keys(self): - """Get the info axis for the DataFrame. - - Returns: - A pandas Index for this DataFrame. - """ - # Each partition should have the same index, so we'll use 0's - return self.columns - - def transpose(self, *args, **kwargs): - """Transpose columns and rows for the DataFrame. - - Returns: - A new DataFrame transposed from this DataFrame. - """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.T, block) for block in self._block_partitions]) - - return DataFrame(block_partitions=new_block_partitions.T, - columns=self.index, - index=self.columns) - - T = property(transpose) - - def dropna(self, axis=0, how='any', thresh=None, subset=None, - inplace=False): - """Create a new DataFrame from the removed NA values from this one. - - Args: - axis (int, tuple, or list): The axis to apply the drop. - how (str): How to drop the NA values. - 'all': drop the label if all values are NA. - 'any': drop the label if any values are NA. - thresh (int): The minimum number of NAs to require. - subset ([label]): Labels to consider from other axis. - inplace (bool): Change this DataFrame or return a new DataFrame. - True: Modify the data for this DataFrame, return None. - False: Create a new DataFrame and return it. - - Returns: - If inplace is set to True, returns None, otherwise returns a new - DataFrame with the dropna applied. - """ - inplace = validate_bool_kwarg(inplace, "inplace") - - if is_list_like(axis): - axis = [pandas.DataFrame()._get_axis_number(ax) for ax in axis] - - result = self - # TODO(kunalgosar): this builds an intermediate dataframe, - # which does unnecessary computation - for ax in axis: - result = result.dropna( - axis=ax, how=how, thresh=thresh, subset=subset) - if not inplace: - return result - - self._update_inplace(block_partitions=result._block_partitions, - columns=result.columns, - index=result.index) - - return None - - axis = pandas.DataFrame()._get_axis_number(axis) - - if how is not None and how not in ['any', 'all']: - raise ValueError('invalid how option: %s' % how) - if how is None and thresh is None: - raise TypeError('must specify how or thresh') - - indices = None - if subset is not None: - if axis == 1: - indices = self.index.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - else: - indices = self.columns.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - - def dropna_helper(df): - new_df = df.dropna(axis=axis, how=how, thresh=thresh, - subset=indices, inplace=False) - - if axis == 1: - new_index = new_df.columns - new_df.columns = pandas.RangeIndex(0, len(new_df.columns)) - else: - new_index = new_df.index - new_df.reset_index(drop=True, inplace=True) - - return new_df, new_index - - parts = self._col_partitions if axis == 1 else self._row_partitions - result = [_deploy_func._submit(args=(dropna_helper, df), - num_return_vals=2) for df in parts] - new_parts, new_vals = [list(t) for t in zip(*result)] - - if axis == 1: - new_vals = [self._col_metadata.get_global_indices(i, vals) - for i, vals in enumerate(ray.get(new_vals))] - - # This flattens the 2d array to 1d - new_vals = [i for j in new_vals for i in j] - new_cols = self.columns[new_vals] - - if not inplace: - return DataFrame(col_partitions=new_parts, - columns=new_cols, - index=self.index) - - self._update_inplace(col_partitions=new_parts, - columns=new_cols, - index=self.index) - - else: - new_vals = [self._row_metadata.get_global_indices(i, vals) - for i, vals in enumerate(ray.get(new_vals))] - - # This flattens the 2d array to 1d - new_vals = [i for j in new_vals for i in j] - new_rows = self.index[new_vals] - - if not inplace: - return DataFrame(row_partitions=new_parts, - index=new_rows, - columns=self.columns) - - self._update_inplace(row_partitions=new_parts, - index=new_rows, - columns=self.columns) - - return None - - def add(self, other, axis='columns', level=None, fill_value=None): - """Add this DataFrame to another or a scalar/list. - - Args: - other: What to add this this DataFrame. - axis: The axis to apply addition over. Only applicaable to Series - or list 'other'. - level: A level in the multilevel axis to add over. - fill_value: The value to fill NaN. - - Returns: - A new DataFrame with the applied addition. - """ - return self._operator_helper(pandas.DataFrame.add, other, axis, level, - fill_value) - - def agg(self, func, axis=0, *args, **kwargs): - return self.aggregate(func, axis, *args, **kwargs) - - def aggregate(self, func, axis=0, *args, **kwargs): - axis = pandas.DataFrame()._get_axis_number(axis) - - result = None - - if axis == 0: - try: - result = self._aggregate(func, axis=axis, *args, **kwargs) - except TypeError: - pass - - if result is None: - kwargs.pop('is_transform', None) - return self.apply(func, axis=axis, args=args, **kwargs) - - return result - - def _aggregate(self, arg, *args, **kwargs): - _axis = kwargs.pop('_axis', None) - if _axis is None: - _axis = getattr(self, 'axis', 0) - kwargs.pop('_level', None) - - if isinstance(arg, compat.string_types): - return self._string_function(arg, *args, **kwargs) - - # Dictionaries have complex behavior because they can be renamed here. - elif isinstance(arg, dict): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - elif is_list_like(arg): - return self.apply(arg, axis=_axis, args=args, **kwargs) - elif callable(arg): - self._callable_function(arg, _axis, *args, **kwargs) - else: - # TODO Make pandas error - raise ValueError("type {} is not callable".format(type(arg))) - - def _string_function(self, func, *args, **kwargs): - assert isinstance(func, compat.string_types) - - f = getattr(self, func, None) - - if f is not None: - if callable(f): - return f(*args, **kwargs) - - assert len(args) == 0 - assert len([kwarg - for kwarg in kwargs - if kwarg not in ['axis', '_level']]) == 0 - return f - - f = getattr(np, func, None) - if f is not None: - raise NotImplementedError("Numpy aggregates not yet supported.") - - raise ValueError("{} is an unknown string function".format(func)) - - def _callable_function(self, func, axis, *args, **kwargs): - kwargs['axis'] = axis - - def agg_helper(df, arg, index, columns, *args, **kwargs): - df.index = index - df.columns = columns - is_transform = kwargs.pop('is_transform', False) - new_df = df.agg(arg, *args, **kwargs) - - is_series = False - index = None - columns = None - - if isinstance(new_df, pandas.Series): - is_series = True - else: - columns = new_df.columns - index = new_df.index - new_df.columns = pandas.RangeIndex(0, len(new_df.columns)) - new_df.reset_index(drop=True, inplace=True) - - if is_transform: - if is_scalar(new_df) or len(new_df) != len(df): - raise ValueError("transforms cannot produce " - "aggregated results") - - return is_series, new_df, index, columns - - if axis == 0: - index = self.index - columns = [self._col_metadata.partition_series(i).index - for i in range(len(self._col_partitions))] - - remote_result = \ - [_deploy_func._submit(args=( - lambda df: agg_helper(df, - func, - index, - cols, - *args, - **kwargs), - part), num_return_vals=4) - for cols, part in zip(columns, self._col_partitions)] - - if axis == 1: - indexes = [self._row_metadata.partition_series(i).index - for i in range(len(self._row_partitions))] - columns = self.columns - - remote_result = \ - [_deploy_func._submit(args=( - lambda df: agg_helper(df, - func, - index, - columns, - *args, - **kwargs), - part), num_return_vals=4) - for index, part in zip(indexes, self._row_partitions)] - - # This magic transposes the list comprehension returned from remote - is_series, new_parts, index, columns = \ - [list(t) for t in zip(*remote_result)] - - # This part is because agg can allow returning a Series or a - # DataFrame, and we have to determine which here. Shouldn't add - # too much to latency in either case because the booleans can - # be returned immediately - is_series = ray.get(is_series) - if all(is_series): - new_series = pandas.concat(ray.get(new_parts), copy=False) - new_series.index = self.columns if axis == 0 else self.index - return new_series - # This error is thrown when some of the partitions return Series and - # others return DataFrames. We do not allow mixed returns. - elif any(is_series): - raise ValueError("no results.") - # The remaining logic executes when we have only DataFrames in the - # remote objects. We build a Ray DataFrame from the Pandas partitions. - elif axis == 0: - new_index = ray.get(index[0]) - # This does not handle the Multi Index case - new_columns = ray.get(columns) - new_columns = new_columns[0].append(new_columns[1:]) - - return DataFrame(col_partitions=new_parts, - columns=new_columns, - index=new_index) - else: - new_columns = ray.get(columns[0]) - # This does not handle the Multi Index case - new_index = ray.get(index) - new_index = new_index[0].append(new_index[1:]) - - return DataFrame(row_partitions=new_parts, - columns=new_columns, - index=new_index) - - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def all(self, axis=None, bool_only=None, skipna=None, level=None, - **kwargs): - """Return whether all elements are True over requested axis - - Note: - If axis=None or axis=0, this call applies df.all(axis=1) - to the transpose of df. - """ - def remote_func(df): - return df.all(axis=axis, bool_only=bool_only, skipna=skipna, - level=level, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def any(self, axis=None, bool_only=None, skipna=None, level=None, - **kwargs): - """Return whether any elements are True over requested axis - - Note: - If axis=None or axis=0, this call applies on the column partitions, - otherwise operates on row partitions - """ - def remote_func(df): - return df.any(axis=axis, bool_only=bool_only, skipna=skipna, - level=level, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def append(self, other, ignore_index=False, verify_integrity=False): - """Append another DataFrame/list/Series to this one. - - Args: - other: The object to append to this. - ignore_index: Ignore the index on appending. - verify_integrity: Verify the integrity of the index on completion. - - Returns: - A new DataFrame containing the concatenated values. - """ - if isinstance(other, (pandas.Series, dict)): - if isinstance(other, dict): - other = pandas.Series(other) - if other.name is None and not ignore_index: - raise TypeError('Can only append a Series if ignore_index=True' - ' or if the Series has a name') - - if other.name is None: - index = None - else: - # other must have the same index name as self, otherwise - # index name will be reset - index = pandas.Index([other.name], name=self.index.name) - - combined_columns = self.columns.tolist() + self.columns.union( - other.index).difference(self.columns).tolist() - other = other.reindex(combined_columns, copy=False) - other = pandas.DataFrame(other.values.reshape((1, len(other))), - index=index, - columns=combined_columns) - other = other._convert(datetime=True, timedelta=True) - elif isinstance(other, list) and not isinstance(other[0], DataFrame): - other = pandas.DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.loc[:, self.columns] - - from .concat import concat - if isinstance(other, (list, tuple)): - to_concat = [self] + other - else: - to_concat = [self, other] - - return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) - - def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, - args=(), **kwds): - """Apply a function along input axis of DataFrame. - - Args: - func: The function to apply - axis: The axis over which to apply the func. - broadcast: Whether or not to broadcast. - raw: Whether or not to convert to a Series. - reduce: Whether or not to try to apply reduction procedures. - - Returns: - Series or DataFrame, depending on func. - """ - axis = pandas.DataFrame()._get_axis_number(axis) - - if isinstance(func, compat.string_types): - if axis == 1: - kwds['axis'] = axis - return getattr(self, func)(*args, **kwds) - elif isinstance(func, dict): - if axis == 1: - raise TypeError( - "(\"'dict' object is not callable\", " - "'occurred at index {0}'".format(self.index[0])) - if len(self.columns) != len(set(self.columns)): - warnings.warn( - 'duplicate column names not supported with apply().', - FutureWarning, stacklevel=2) - has_list = list in map(type, func.values()) - part_ind_tuples = [(self._col_metadata[key], key) for key in func] - - if has_list: - # if input dict has a list, the function to apply must wrap - # single functions in lists as well to get the desired output - # format - result = [_deploy_func.remote( - lambda df: df.iloc[:, ind].apply( - func[key] if is_list_like(func[key]) - else [func[key]]), - self._col_partitions[part]) - for (part, ind), key in part_ind_tuples] - return pandas.concat(ray.get(result), axis=1, copy=False) - else: - result = [_deploy_func.remote( - lambda df: df.iloc[:, ind].apply(func[key]), - self._col_partitions[part]) - for (part, ind), key in part_ind_tuples] - return pandas.Series(ray.get(result), index=func.keys()) - - elif is_list_like(func): - if axis == 1: - raise TypeError( - "(\"'list' object is not callable\", " - "'occurred at index {0}'".format(self.index[0])) - # TODO: some checking on functions that return Series or Dataframe - new_cols = _map_partitions(lambda df: df.apply(func), - self._col_partitions) - - # resolve function names for the DataFrame index - new_index = [f_name if isinstance(f_name, compat.string_types) - else f_name.__name__ for f_name in func] - return DataFrame(col_partitions=new_cols, - columns=self.columns, - index=new_index, - col_metadata=self._col_metadata) - elif callable(func): - return self._callable_function(func, axis=axis, *args, **kwds) - - def as_blocks(self, copy=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def as_matrix(self, columns=None): - """Convert the frame to its Numpy-array representation. - - Args: - columns: If None, return all columns, otherwise, - returns specified columns. - - Returns: - values: ndarray - """ - # TODO this is very inefficient, also see __array__ - return to_pandas(self).as_matrix(columns) - - def asfreq(self, freq, method=None, how=None, normalize=False, - fill_value=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def asof(self, where, subset=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def assign(self, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def astype(self, dtype, copy=True, errors='raise', **kwargs): - if isinstance(dtype, dict): - if (not set(dtype.keys()).issubset(set(self.columns)) and - errors == 'raise'): - raise KeyError( - "Only a column name can be used for the key in" - "a dtype mappings argument.") - columns = list(dtype.keys()) - col_idx = [(self.columns.get_loc(columns[i]), columns[i]) - if columns[i] in self.columns - else (columns[i], columns[i]) - for i in range(len(columns))] - new_dict = {} - for idx, key in col_idx: - new_dict[idx] = dtype[key] - new_rows = _map_partitions(lambda df, dt: df.astype(dtype=dt, - copy=True, - errors=errors, - **kwargs), - self._row_partitions, new_dict) - if copy: - return DataFrame(row_partitions=new_rows, - columns=self.columns, - index=self.index) - self._row_partitions = new_rows - else: - new_blocks = [_map_partitions(lambda d: d.astype(dtype=dtype, - copy=True, - errors=errors, - **kwargs), - block) - for block in self._block_partitions] - if copy: - return DataFrame(block_partitions=new_blocks, - columns=self.columns, - index=self.index) - self._block_partitions = new_blocks - - def at_time(self, time, asof=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def between_time(self, start_time, end_time, include_start=True, - include_end=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def bfill(self, axis=None, inplace=False, limit=None, downcast=None): - """Synonym for DataFrame.fillna(method='bfill') - """ - new_df = self.fillna(method='bfill', - axis=axis, - limit=limit, - downcast=downcast, - inplace=inplace) - if not inplace: - return new_df - - def bool(self): - """Return the bool of a single element PandasObject. - - This must be a boolean scalar value, either True or False. Raise a - ValueError if the PandasObject does not have exactly 1 element, or that - element is not boolean - """ - shape = self.shape - if shape != (1,) and shape != (1, 1): - raise ValueError("""The PandasObject does not have exactly - 1 element. Return the bool of a single - element PandasObject. The truth value is - ambiguous. Use a.empty, a.item(), a.any() - or a.all().""") - else: - return to_pandas(self).bool() - - def boxplot(self, column=None, by=None, ax=None, fontsize=None, rot=0, - grid=True, figsize=None, layout=None, return_type=None, - **kwds): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, - **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def clip_lower(self, threshold, axis=None, inplace=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def clip_upper(self, threshold, axis=None, inplace=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def combine(self, other, func, fill_value=None, overwrite=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def combine_first(self, other): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def compound(self, axis=None, skipna=None, level=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def consolidate(self, inplace=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def convert_objects(self, convert_dates=True, convert_numeric=False, - convert_timedeltas=True, copy=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def corr(self, method='pearson', min_periods=1): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def corrwith(self, other, axis=0, drop=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def count(self, axis=0, level=None, numeric_only=False): - """Get the count of non-null objects in the DataFrame. - - Arguments: - axis: 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - level: If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a DataFrame. - numeric_only: Include only float, int, boolean data - - Returns: - The count, in a Series (or DataFrame if level is specified). - """ - def remote_func(df): - return df.count(axis=axis, level=level, numeric_only=numeric_only) - - return self._arithmetic_helper(remote_func, axis, level) - - def cov(self, min_periods=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def _cumulative_helper(self, func, axis): - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 - - if axis == 0: - new_cols = _map_partitions(func, self._col_partitions) - return DataFrame(col_partitions=new_cols, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) - else: - new_rows = _map_partitions(func, self._row_partitions) - return DataFrame(row_partitions=new_rows, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) - - def cummax(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative maximum across the DataFrame. - - Args: - axis (int): The axis to take maximum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative maximum of the DataFrame. - """ - def remote_func(df): - return df.cummax(axis=axis, skipna=skipna, *args, **kwargs) - - return self._cumulative_helper(remote_func, axis) - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative minimum across the DataFrame. - - Args: - axis (int): The axis to cummin on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative minimum of the DataFrame. - """ - def remote_func(df): - return df.cummin(axis=axis, skipna=skipna, *args, **kwargs) - - return self._cumulative_helper(remote_func, axis) - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative product across the DataFrame. - - Args: - axis (int): The axis to take product on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative product of the DataFrame. - """ - def remote_func(df): - return df.cumprod(axis=axis, skipna=skipna, *args, **kwargs) - - return self._cumulative_helper(remote_func, axis) - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative sum across the DataFrame. - - Args: - axis (int): The axis to take sum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative sum of the DataFrame. - """ - def remote_func(df): - return df.cumsum(axis=axis, skipna=skipna, *args, **kwargs) - - return self._cumulative_helper(remote_func, axis) - - def describe(self, percentiles=None, include=None, exclude=None): - """ - Generates descriptive statistics that summarize the central tendency, - dispersion and shape of a dataset's distribution, excluding NaN values. - - Args: - percentiles (list-like of numbers, optional): - The percentiles to include in the output. - include: White-list of data types to include in results - exclude: Black-list of data types to exclude in results - - Returns: Series/DataFrame of summary statistics - """ - def describe_helper(df): - """This to ensure nothing goes on with non-numeric columns""" - try: - return df.select_dtypes(exclude='object').describe( - percentiles=percentiles, - include=include, - exclude=exclude) - # This exception is thrown when there are only non-numeric columns - # in this partition - except ValueError: - return pandas.DataFrame() - - # Begin fixing index based on the columns inside. - parts = ray.get(_map_partitions(describe_helper, self._col_partitions)) - # We use the index to get the internal index. - parts = [(parts[i], i) for i in range(len(parts))] - - for df, partition in parts: - this_partition = self._col_metadata.partition_series(partition) - df.columns = this_partition[this_partition.isin(df.columns)].index - - # Remove index from tuple - result = pandas.concat([obj[0] for obj in parts], axis=1, copy=False) - return result - - def diff(self, periods=1, axis=0): - """Finds the difference between elements on the axis requested - - Args: - periods: Periods to shift for forming difference - axis: Take difference over rows or columns - - Returns: - DataFrame with the diff applied - """ - axis = pandas.DataFrame()._get_axis_number(axis) - partitions = (self._col_partitions if - axis == 0 else self._row_partitions) - - result = _map_partitions(lambda df: - df.diff(axis=axis, periods=periods), - partitions) - - if (axis == 1): - return DataFrame(row_partitions=result, - columns=self.columns, - index=self.index) - if (axis == 0): - return DataFrame(col_partitions=result, - columns=self.columns, - index=self.index) - - def div(self, other, axis='columns', level=None, fill_value=None): - """Divides this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the divide against this. - axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. - """ - return self._operator_helper(pandas.DataFrame.div, other, axis, level, - fill_value) - - def divide(self, other, axis='columns', level=None, fill_value=None): - """Synonym for div. - - Args: - other: The object to use to apply the divide against this. - axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. - """ - return self.div(other, axis, level, fill_value) - - def dot(self, other): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def drop(self, labels=None, axis=0, index=None, columns=None, level=None, - inplace=False, errors='raise'): - """Return new object with labels in requested axis removed. - Args: - labels: Index or column labels to drop. - axis: Whether to drop labels from the index (0 / 'index') or - columns (1 / 'columns'). - index, columns: Alternative to specifying axis (labels, axis=1 is - equivalent to columns=labels). - level: For MultiIndex - inplace: If True, do operation inplace and return None. - errors: If 'ignore', suppress error and existing labels are - dropped. - Returns: - dropped : type of caller - """ - # TODO implement level - if level is not None: - raise NotImplementedError("Level not yet supported for drop") - - inplace = validate_bool_kwarg(inplace, "inplace") - if labels is not None: - if index is not None or columns is not None: - raise ValueError("Cannot specify both 'labels' and " - "'index'/'columns'") - axis = pandas.DataFrame()._get_axis_name(axis) - axes = {axis: labels} - elif index is not None or columns is not None: - axes, _ = pandas.DataFrame() \ - ._construct_axes_from_arguments((index, columns), {}) - else: - raise ValueError("Need to specify at least one of 'labels', " - "'index' or 'columns'") - obj = self.copy() - - def drop_helper(obj, axis, label): - # TODO(patyang): If you drop from the index first, you can do it - # in batch by returning the dropped items. Likewise coords.drop - # leaves the coords df in an inconsistent state. - if axis == 'index': - try: - coords = obj._row_metadata[label] - if isinstance(coords, pandas.DataFrame): - partitions = list(coords['partition']) - indexes = list(coords['index_within_partition']) - else: - partitions, indexes = coords - partitions = [partitions] - indexes = [indexes] - - for part, index in zip(partitions, indexes): - x = _deploy_func.remote( - lambda df: df.drop(labels=index, axis=axis, - errors='ignore'), - obj._row_partitions[part]) - obj._row_partitions = \ - [obj._row_partitions[i] if i != part - else x - for i in range(len(obj._row_partitions))] - - # The decrement here is because we're dropping one at a - # time and the index is automatically updated when we - # convert back to blocks. - obj._row_metadata.squeeze(part, index) - - obj._row_metadata.drop(labels=label) - except KeyError: - return obj - else: - try: - coords = obj._col_metadata[label] - if isinstance(coords, pandas.DataFrame): - partitions = list(coords['partition']) - indexes = list(coords['index_within_partition']) - else: - partitions, indexes = coords - partitions = [partitions] - indexes = [indexes] - - for part, index in zip(partitions, indexes): - x = _deploy_func.remote( - lambda df: df.drop(labels=index, axis=axis, - errors='ignore'), - obj._col_partitions[part]) - obj._col_partitions = \ - [obj._col_partitions[i] if i != part - else x - for i in range(len(obj._col_partitions))] - - # The decrement here is because we're dropping one at a - # time and the index is automatically updated when we - # convert back to blocks. - obj._col_metadata.squeeze(part, index) - - obj._col_metadata.drop(labels=label) - except KeyError: - return obj - - return obj - - for axis, labels in axes.items(): - if labels is None: - continue - - if is_list_like(labels): - for label in labels: - if errors != 'ignore' and label and \ - label not in getattr(self, axis): - raise ValueError("The label [{}] is not in the [{}]", - label, axis) - else: - obj = drop_helper(obj, axis, label) - else: - if errors != 'ignore' and labels and \ - labels not in getattr(self, axis): - raise ValueError("The label [{}] is not in the [{}]", - labels, axis) - else: - obj = drop_helper(obj, axis, labels) - - if not inplace: - return obj - else: - self._row_metadata = obj._row_metadata - self._col_metadata = obj._col_metadata - self._block_partitions = obj._block_partitions - - def drop_duplicates(self, subset=None, keep='first', inplace=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def duplicated(self, subset=None, keep='first'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def eq(self, other, axis='columns', level=None): - """Checks element-wise that this is equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the eq over. - level: The Multilevel index level to apply eq over. - - Returns: - A new DataFrame filled with Booleans. - """ - return self._operator_helper(pandas.DataFrame.eq, other, axis, level) - - def equals(self, other): - """ - Checks if other DataFrame is elementwise equal to the current one - - Returns: - Boolean: True if equal, otherwise False - """ - - if not self.index.equals(other.index) or not \ - self.columns.equals(other.columns): - return False - - # We copartition because we don't know what the DataFrames look like - # before this. Empty partitions can give problems with - # _match_partitioning (See _match_partitioning) - new_zipped_parts = self._copartition(other, self.index) - - equals_partitions = [_equals_helper.remote(left, right) - for left, right in new_zipped_parts] - - # To avoid getting all we use next notation. - return next((False for eq in equals_partitions if not ray.get(eq)), - True) - - def eval(self, expr, inplace=False, **kwargs): - """Evaluate a Python expression as a string using various backends. - Args: - expr: The expression to evaluate. This string cannot contain any - Python statements, only Python expressions. - - parser: The parser to use to construct the syntax tree from the - expression. The default of 'pandas' parses code slightly - different than standard Python. Alternatively, you can parse - an expression using the 'python' parser to retain strict - Python semantics. See the enhancing performance documentation - for more details. - - engine: The engine used to evaluate the expression. - - truediv: Whether to use true division, like in Python >= 3 - - local_dict: A dictionary of local variables, taken from locals() - by default. - - global_dict: A dictionary of global variables, taken from - globals() by default. - - resolvers: A list of objects implementing the __getitem__ special - method that you can use to inject an additional collection - of namespaces to use for variable lookup. For example, this is - used in the query() method to inject the index and columns - variables that refer to their respective DataFrame instance - attributes. - - level: The number of prior stack frames to traverse and add to - the current scope. Most users will not need to change this - parameter. - - target: This is the target object for assignment. It is used when - there is variable assignment in the expression. If so, then - target must support item assignment with string keys, and if a - copy is being returned, it must also support .copy(). - - inplace: If target is provided, and the expression mutates target, - whether to modify target inplace. Otherwise, return a copy of - target with the mutation. - Returns: - ndarray, numeric scalar, DataFrame, Series - """ - self._validate_eval_query(expr, **kwargs) - - columns = self.columns - - def eval_helper(df): - df.columns = columns - result = df.eval(expr, inplace=False, **kwargs) - # If result is a series, expr was not an assignment expression. - if not isinstance(result, pandas.Series): - result.columns = pandas.RangeIndex(0, len(result.columns)) - return result - - inplace = validate_bool_kwarg(inplace, "inplace") - new_rows = _map_partitions(eval_helper, self._row_partitions) - - result_type = ray.get(_deploy_func.remote(lambda df: type(df), - new_rows[0])) - if result_type is pandas.Series: - new_series = pandas.concat(ray.get(new_rows), axis=0, copy=False) - new_series.index = self.index - return new_series - - columns_copy = self._col_metadata._coord_df.copy().T - columns_copy.eval(expr, inplace=True, **kwargs) - columns = columns_copy.columns - - if inplace: - self._update_inplace(row_partitions=new_rows, columns=columns, - index=self.index) - else: - return DataFrame(columns=columns, row_partitions=new_rows) - - def ewm(self, com=None, span=None, halflife=None, alpha=None, - min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def expanding(self, min_periods=1, freq=None, center=False, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def ffill(self, axis=None, inplace=False, limit=None, downcast=None): - """Synonym for DataFrame.fillna(method='ffill') - """ - new_df = self.fillna(method='ffill', - axis=axis, - limit=limit, - downcast=downcast, - inplace=inplace) - if not inplace: - return new_df - - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - """Fill NA/NaN values using the specified method. - - Args: - value: Value to use to fill holes. This value cannot be a list. - - method: Method to use for filling holes in reindexed Series pad. - ffill: propagate last valid observation forward to next valid - backfill. - bfill: use NEXT valid observation to fill gap. - - axis: 0 or 'index', 1 or 'columns'. - - inplace: If True, fill in place. Note: this will modify any other - views on this object. - - limit: If method is specified, this is the maximum number of - consecutive NaN values to forward/backward fill. In other - words, if there is a gap with more than this number of - consecutive NaNs, it will only be partially filled. If method - is not specified, this is the maximum number of entries along - the entire axis where NaNs will be filled. Must be greater - than 0 if not None. - - downcast: A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an - appropriate equal type. - - Returns: - filled: DataFrame - """ - # TODO implement value passed as DataFrame - if isinstance(value, pandas.DataFrame): - raise NotImplementedError("Passing a DataFrame as the value for " - "fillna is not yet supported.") - - inplace = validate_bool_kwarg(inplace, 'inplace') - - axis = pandas.DataFrame()._get_axis_number(axis) \ - if axis is not None \ - else 0 - - if isinstance(value, (list, tuple)): - raise TypeError('"value" parameter must be a scalar or dict, but ' - 'you passed a "{0}"'.format(type(value).__name__)) - if value is None and method is None: - raise ValueError('must specify a fill method or value') - if value is not None and method is not None: - raise ValueError('cannot specify both a fill method and value') - if method is not None and method not in ['backfill', 'bfill', 'pad', - 'ffill']: - expecting = 'pad (ffill) or backfill (bfill)' - msg = 'Invalid fill method. Expecting {expecting}. Got {method}'\ - .format(expecting=expecting, method=method) - raise ValueError(msg) - - if inplace: - new_obj = self - else: - new_obj = self.copy() - - parts, coords_obj = (new_obj._col_partitions, - new_obj._col_metadata) if axis == 0 else \ - (new_obj._row_partitions, - new_obj._row_metadata) - - if isinstance(value, (pandas.Series, dict)): - new_vals = {} - value = dict(value) - for val in value: - # Get the local index for the partition - try: - part, index = coords_obj[val] - # Pandas ignores these errors so we will suppress them too. - except KeyError: - continue - - new_vals[val] = _deploy_func.remote(lambda df: df.fillna( - value={index: value[val]}, - method=method, - axis=axis, - inplace=False, - limit=limit, - downcast=downcast, - **kwargs), parts[part]) - - # Not every partition was changed, so we put everything back that - # was not changed and update those that were. - new_parts = [parts[i] if coords_obj.index[i] not in new_vals - else new_vals[coords_obj.index[i]] - for i in range(len(parts))] - else: - new_parts = _map_partitions(lambda df: df.fillna( - value=value, - method=method, - axis=axis, - inplace=False, - limit=limit, - downcast=downcast, - **kwargs), parts) - - if axis == 0: - new_obj._update_inplace(col_partitions=new_parts, - columns=self.columns, - index=self.index) - else: - new_obj._update_inplace(row_partitions=new_parts, - columns=self.columns, - index=self.index) - if not inplace: - return new_obj - - def filter(self, items=None, like=None, regex=None, axis=None): - """Subset rows or columns based on their labels - - Args: - items (list): list of labels to subset - like (string): retain labels where `arg in label == True` - regex (string): retain labels matching regex input - axis: axis to filter on - - Returns: - A new dataframe with the filter applied. - """ - nkw = com._count_not_none(items, like, regex) - if nkw > 1: - raise TypeError('Keyword arguments `items`, `like`, or `regex` ' - 'are mutually exclusive') - if nkw == 0: - raise TypeError('Must pass either `items`, `like`, or `regex`') - - if axis is None: - axis = 'columns' # This is the default info axis for dataframes - - axis = pandas.DataFrame()._get_axis_number(axis) - labels = self.columns if axis else self.index - - if items is not None: - bool_arr = labels.isin(items) - elif like is not None: - def f(x): - return like in to_str(x) - bool_arr = labels.map(f).tolist() - else: - def f(x): - return matcher.search(to_str(x)) is not None - matcher = re.compile(regex) - bool_arr = labels.map(f).tolist() - - if not axis: - return self[bool_arr] - return self[self.columns[bool_arr]] - - def first(self, offset): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def first_valid_index(self): - """Return index for first non-NA/null value. - - Returns: - scalar: type of index - """ - return self._row_metadata.first_valid_index() - - def floordiv(self, other, axis='columns', level=None, fill_value=None): - """Divides this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the divide against this. - axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. - """ - return self._operator_helper(pandas.DataFrame.floordiv, other, axis, - level, fill_value) - - @classmethod - def from_csv(self, path, header=0, sep=', ', index_col=0, - parse_dates=True, encoding=None, tupleize_cols=None, - infer_datetime_format=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @classmethod - def from_dict(self, data, orient='columns', dtype=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @classmethod - def from_items(self, items, columns=None, orient='columns'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @classmethod - def from_records(self, data, index=None, exclude=None, columns=None, - coerce_float=False, nrows=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def ge(self, other, axis='columns', level=None): - """Checks element-wise that this is greater than or equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the gt over. - level: The Multilevel index level to apply gt over. - - Returns: - A new DataFrame filled with Booleans. - """ - return self._operator_helper(pandas.DataFrame.ge, other, axis, level) - - def get(self, key, default=None): - """Get item from object for given key (DataFrame column, Panel - slice, etc.). Returns default value if not found. - - Args: - key (DataFrame column, Panel slice) : the key for which value - to get - - Returns: - value (type of items contained in object) : A value that is - stored at the key - """ - try: - return self[key] - except (KeyError, ValueError, IndexError): - return default - - def get_dtype_counts(self): - """Get the counts of dtypes in this object. - - Returns: - The counts of dtypes in this object. - """ - return ray.get(_deploy_func.remote(lambda df: df.get_dtype_counts(), - self._row_partitions[0])) - - def get_ftype_counts(self): - """Get the counts of ftypes in this object. - - Returns: - The counts of ftypes in this object. - """ - return ray.get(_deploy_func.remote(lambda df: df.get_ftype_counts(), - self._row_partitions[0])) - - def get_value(self, index, col, takeable=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def get_values(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def gt(self, other, axis='columns', level=None): - """Checks element-wise that this is greater than other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the gt over. - level: The Multilevel index level to apply gt over. - - Returns: - A new DataFrame filled with Booleans. - """ - return self._operator_helper(pandas.DataFrame.gt, other, axis, level) - - def head(self, n=5): - """Get the first n rows of the dataframe. - - Args: - n (int): The number of rows to return. - - Returns: - A new dataframe with the first n rows of the dataframe. - """ - if n >= len(self._row_metadata): - return self.copy() - - new_dfs = _map_partitions(lambda df: df.head(n), - self._col_partitions) - - index = self._row_metadata.index[:n] - - return DataFrame(col_partitions=new_dfs, - col_metadata=self._col_metadata, - index=index, - dtypes_cache=self._dtypes_cache) - - def hist(self, data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def idxmax(self, axis=0, skipna=True): - """Get the index of the first occurrence of the max value of the axis. - - Args: - axis (int): Identify the max over the rows (1) or columns (0). - skipna (bool): Whether or not to skip NA values. - - Returns: - A Series with the index for each maximum value for the axis - specified. - """ - if not all(d != np.dtype('O') for d in self.dtypes): - raise TypeError( - "reduction operation 'argmax' not allowed for this dtype") - - def remote_func(df): - return df.idxmax(axis=axis, skipna=skipna) - - internal_indices = self._arithmetic_helper(remote_func, axis) - # do this to convert internal indices to correct index - return internal_indices.apply(lambda x: self.index[x]) - - def idxmin(self, axis=0, skipna=True): - """Get the index of the first occurrence of the min value of the axis. - - Args: - axis (int): Identify the min over the rows (1) or columns (0). - skipna (bool): Whether or not to skip NA values. - - Returns: - A Series with the index for each minimum value for the axis - specified. - """ - if not all(d != np.dtype('O') for d in self.dtypes): - raise TypeError( - "reduction operation 'argmax' not allowed for this dtype") - - def remote_func(df): - return df.idxmin(axis=axis, skipna=skipna) - - internal_indices = self._arithmetic_helper(remote_func, axis) - # do this to convert internal indices to correct index - return internal_indices.apply(lambda x: self.index[x]) - - def infer_objects(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, - null_counts=None): - - def info_helper(df): - output_buffer = io.StringIO() - df.info(verbose=verbose, - buf=output_buffer, - max_cols=max_cols, - memory_usage=memory_usage, - null_counts=null_counts) - return output_buffer.getvalue() - - # Combine the per-partition info and split into lines - result = ''.join(ray.get(_map_partitions(info_helper, - self._col_partitions))) - lines = result.split('\n') - - # Class denoted in info() output - class_string = '\n' - - # Create the Index info() string by parsing self.index - index_string = self.index.summary() + '\n' - - # A column header is needed in the inf() output - col_header = 'Data columns (total {0} columns):\n' \ - .format(len(self.columns)) - - # Parse the per-partition values to get the per-column details - # Find all the lines in the output that start with integers - prog = re.compile('^[0-9]+.+') - col_lines = [prog.match(line) for line in lines] - cols = [c.group(0) for c in col_lines if c is not None] - # replace the partition columns names with real column names - columns = ["{0}\t{1}\n".format(self.columns[i], - cols[i].split(" ", 1)[1]) - for i in range(len(cols))] - col_string = ''.join(columns) + '\n' - - # A summary of the dtypes in the dataframe - dtypes_string = "dtypes: " - for dtype, count in self.dtypes.value_counts().iteritems(): - dtypes_string += "{0}({1}),".format(dtype, count) - dtypes_string = dtypes_string[:-1] + '\n' - - # Compute the memory usage by summing per-partitions return values - # Parse lines for memory usage number - prog = re.compile('^memory+.+') - mems = [prog.match(line) for line in lines] - mem_vals = [float(re.search(r'\d+', m.group(0)).group()) - for m in mems if m is not None] - - memory_string = "" - - if len(mem_vals) != 0: - # Sum memory usage from each partition - if memory_usage != 'deep': - memory_string = 'memory usage: {0}+ bytes' \ - .format(sum(mem_vals)) - else: - memory_string = 'memory usage: {0} bytes'.format(sum(mem_vals)) - - # Combine all the components of the info() output - result = ''.join([class_string, index_string, col_header, - col_string, dtypes_string, memory_string]) - - # Write to specified output buffer - if buf: - buf.write(result) - else: - sys.stdout.write(result) - - def insert(self, loc, column, value, allow_duplicates=False): - """Insert column into DataFrame at specified location. - - Args: - loc (int): Insertion index. Must verify 0 <= loc <= len(columns). - column (hashable object): Label of the inserted column. - value (int, Series, or array-like): The values to insert. - allow_duplicates (bool): Whether to allow duplicate column names. - """ - if not is_list_like(value): - value = np.full(len(self.index), value) - - if len(value) != len(self.index): - raise ValueError( - "Length of values does not match length of index") - if not allow_duplicates and column in self.columns: - raise ValueError( - "cannot insert {0}, already exists".format(column)) - if loc > len(self.columns): - raise IndexError( - "index {0} is out of bounds for axis 0 with size {1}".format( - loc, len(self.columns))) - if loc < 0: - raise ValueError("unbounded slice") - - partition, index_within_partition = \ - self._col_metadata.insert(column, loc) - - # Deploy insert function to specific column partition, and replace that - # column - def insert_col_part(df): - if isinstance(value, pandas.Series) and \ - isinstance(value.dtype, - pandas.core.dtypes.dtypes.DatetimeTZDtype): - # Need to set index to index of this dtype or inserted values - # become NaT - df.index = value - df.insert(index_within_partition, column, - value, allow_duplicates) - df.index = pandas.RangeIndex(0, len(df)) - else: - df.insert(index_within_partition, column, - value, allow_duplicates) - return df - - new_obj = _deploy_func.remote(insert_col_part, - self._col_partitions[partition]) - - new_cols = [self._col_partitions[i] - if i != partition - else new_obj - for i in range(len(self._col_partitions))] - new_col_names = self.columns.insert(loc, column) - - self._update_inplace(col_partitions=new_cols, columns=new_col_names, - index=self.index) - - def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', downcast=None, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def iterrows(self): - """Iterate over DataFrame rows as (index, Series) pairs. - - Note: - Generators can't be pickled so from the remote function - we expand the generator into a list before getting it. - This is not that ideal. - - Returns: - A generator that iterates over the rows of the frame. - """ - index_iter = (self._row_metadata.partition_series(i).index - for i in range(len(self._row_partitions))) - - def iterrow_helper(part): - df = ray.get(part) - df.columns = self.columns - df.index = next(index_iter) - return df.iterrows() - - partition_iterator = PartitionIterator(self._row_partitions, - iterrow_helper) - - for v in partition_iterator: - yield v - - def items(self): - """Iterator over (column name, Series) pairs. - - Note: - Generators can't be pickled so from the remote function - we expand the generator into a list before getting it. - This is not that ideal. - - Returns: - A generator that iterates over the columns of the frame. - """ - col_iter = (self._col_metadata.partition_series(i).index - for i in range(len(self._col_partitions))) - - def items_helper(part): - df = ray.get(part) - df.columns = next(col_iter) - df.index = self.index - return df.items() - - partition_iterator = PartitionIterator(self._col_partitions, - items_helper) - - for v in partition_iterator: - yield v - - def iteritems(self): - """Iterator over (column name, Series) pairs. - - Note: - Returns the same thing as .items() - - Returns: - A generator that iterates over the columns of the frame. - """ - return self.items() - - def itertuples(self, index=True, name='Pandas'): - """Iterate over DataFrame rows as namedtuples. - - Args: - index (boolean, default True): If True, return the index as the - first element of the tuple. - name (string, default "Pandas"): The name of the returned - namedtuples or None to return regular tuples. - Note: - Generators can't be pickled so from the remote function - we expand the generator into a list before getting it. - This is not that ideal. - - Returns: - A tuple representing row data. See args for varying tuples. - """ - index_iter = (self._row_metadata.partition_series(i).index - for i in range(len(self._row_partitions))) - - def itertuples_helper(part): - df = ray.get(part) - df.columns = self.columns - df.index = next(index_iter) - return df.itertuples(index=index, name=name) - - partition_iterator = PartitionIterator(self._row_partitions, - itertuples_helper) - - for v in partition_iterator: - yield v - - def join(self, other, on=None, how='left', lsuffix='', rsuffix='', - sort=False): - """Join two or more DataFrames, or a DataFrame with a collection. - - Args: - other: What to join this DataFrame with. - on: A column name to use from the left for the join. - how: What type of join to conduct. - lsuffix: The suffix to add to column names that match on left. - rsuffix: The suffix to add to column names that match on right. - sort: Whether or not to sort. - - Returns: - The joined DataFrame. - """ - - if on is not None: - raise NotImplementedError("Not yet.") - - if isinstance(other, pandas.Series): - if other.name is None: - raise ValueError("Other Series must have a name") - other = DataFrame({other.name: other}) - - if isinstance(other, DataFrame): - if on is not None: - index = self[on] - else: - index = self.index - - new_index = index.join(other.index, how=how, sort=sort) - - # Joining two empty DataFrames is fast, and error checks for us. - new_column_labels = pandas.DataFrame(columns=self.columns) \ - .join(pandas.DataFrame(columns=other.columns), - lsuffix=lsuffix, rsuffix=rsuffix).columns - - new_partition_num = max(len(self._block_partitions.T), - len(other._block_partitions.T)) - - # Join is a concat once we have shuffled the data internally. - # We shuffle the data by computing the correct order. - # Another important thing to note: We set the current self index - # to the index variable which may be 'on'. - new_self = np.array([ - _reindex_helper._submit(args=tuple([index, new_index, 1, - new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num) - for block in self._block_partitions.T]) - new_other = np.array([ - _reindex_helper._submit(args=tuple([other.index, new_index, 1, - new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num) - for block in other._block_partitions.T]) - - # Append the blocks together (i.e. concat) - new_block_parts = np.concatenate((new_self, new_other)).T - - # Default index in the case that on is set. - if on is not None: - new_index = None - - # TODO join the two metadata tables for performance. - return DataFrame(block_partitions=new_block_parts, - index=new_index, - columns=new_column_labels) - else: - # This constraint carried over from Pandas. - if on is not None: - raise ValueError("Joining multiple DataFrames only supported" - " for joining on index") - - # Joining the empty DataFrames with either index or columns is - # fast. It gives us proper error checking for the edge cases that - # would otherwise require a lot more logic. - new_index = pandas.DataFrame(index=self.index).join( - [pandas.DataFrame(index=obj.index) for obj in other], - how=how, sort=sort).index - - new_column_labels = pandas.DataFrame(columns=self.columns).join( - [pandas.DataFrame(columns=obj.columns) for obj in other], - lsuffix=lsuffix, rsuffix=rsuffix).columns - - new_partition_num = max([len(self._block_partitions.T)] + - [len(obj._block_partitions.T) - for obj in other]) - - new_self = np.array([ - _reindex_helper._submit(args=tuple([self.index, new_index, 1, - new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num) - for block in self._block_partitions.T]) - - new_others = np.array([_reindex_helper._submit( - args=tuple([obj.index, new_index, 1, new_partition_num] + - block.tolist()), - num_return_vals=new_partition_num - ) for obj in other for block in obj._block_partitions.T]) - - # Append the columns together (i.e. concat) - new_block_parts = np.concatenate((new_self, new_others)).T - - # TODO join the two metadata tables for performance. - return DataFrame(block_partitions=new_block_parts, - index=new_index, - columns=new_column_labels) - - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def last(self, offset): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def last_valid_index(self): - """Return index for last non-NA/null value. - - Returns: - scalar: type of index - """ - return self._row_metadata.last_valid_index() - - def le(self, other, axis='columns', level=None): - """Checks element-wise that this is less than or equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the le over. - level: The Multilevel index level to apply le over. - - Returns: - A new DataFrame filled with Booleans. - """ - return self._operator_helper(pandas.DataFrame.le, other, axis, level) - - def lookup(self, row_labels, col_labels): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def lt(self, other, axis='columns', level=None): - """Checks element-wise that this is less than other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the lt over. - level: The Multilevel index level to apply lt over. - - Returns: - A new DataFrame filled with Booleans. - """ - return self._operator_helper(pandas.DataFrame.lt, other, axis, level) - - def mad(self, axis=None, skipna=None, level=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False, raise_on_error=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def max(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - """Perform max across the DataFrame. - - Args: - axis (int): The axis to take the max on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The max of the DataFrame. - """ - def remote_func(df): - return df.max(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - """Computes mean across the DataFrame. - - Args: - axis (int): The axis to take the mean on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The mean of the DataFrame. (Pandas series) - """ - def remote_func(df): - return df.mean(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def median(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - """Computes median across the DataFrame. - - Args: - axis (int): The axis to take the median on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The median of the DataFrame. (Pandas series) - """ - def remote_func(df): - return df.median(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def melt(self, id_vars=None, value_vars=None, var_name=None, - value_name='value', col_level=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def memory_usage(self, index=True, deep=False): - - def remote_func(df): - return df.memory_usage(index=False, deep=deep) - - result = self._arithmetic_helper(remote_func, axis=0) - - result.index = self.columns - if index: - index_value = self._row_metadata.index.memory_usage(deep=deep) - return pandas.Series(index_value, index=['Index']).append(result) - - return result - - def merge(self, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None): - """Database style join, where common columns in "on" are merged. - - Args: - right: The DataFrame to merge against. - how: What type of join to use. - on: The common column name(s) to join on. If None, and left_on and - right_on are also None, will default to all commonly named - columns. - left_on: The column(s) on the left to use for the join. - right_on: The column(s) on the right to use for the join. - left_index: Use the index from the left as the join keys. - right_index: Use the index from the right as the join keys. - sort: Sort the join keys lexicographically in the result. - suffixes: Add this suffix to the common names not in the "on". - copy: Does nothing in our implementation - indicator: Adds a column named _merge to the DataFrame with - metadata from the merge about each row. - validate: Checks if merge is a specific type. - - Returns: - A merged Dataframe - """ - - if not isinstance(right, DataFrame): - raise ValueError("can not merge DataFrame with instance of type " - "{}".format(type(right))) - - args = (how, on, left_on, right_on, left_index, right_index, sort, - suffixes, False, indicator, validate) - - left_cols = ray.put(self.columns) - right_cols = ray.put(right.columns) - - # This can be put in a remote function because we don't need it until - # the end, and the columns can be built asynchronously. This takes the - # columns defining off the critical path and speeds up the overall - # merge. - new_columns = _merge_columns.remote(left_cols, right_cols, *args) - - if on is not None: - if left_on is not None or right_on is not None: - raise MergeError("Can only pass argument \"on\" OR \"left_on\"" - " and \"right_on\", not a combination of " - "both.") - if not is_list_like(on): - on = [on] - - if next((True for key in on if key not in self), False) or \ - next((True for key in on if key not in right), False): - - missing_key = \ - next((str(key) for key in on if key not in self), "") + \ - next((str(key) for key in on if key not in right), "") - raise KeyError(missing_key) - - elif right_on is not None or right_index is True: - if left_on is None and left_index is False: - # Note: This is not the same error as pandas, but pandas throws - # a ValueError NoneType has no len(), and I don't think that - # helps enough. - raise TypeError("left_on must be specified or left_index must " - "be true if right_on is specified.") - - elif left_on is not None or left_index is True: - if right_on is None and right_index is False: - # Note: See note above about TypeError. - raise TypeError("right_on must be specified or right_index " - "must be true if right_on is specified.") - - if left_on is not None: - if not is_list_like(left_on): - left_on = [left_on] - - if next((True for key in left_on if key not in self), False): - raise KeyError(next(key for key in left_on - if key not in self)) - - if right_on is not None: - if not is_list_like(right_on): - right_on = [right_on] - - if next((True for key in right_on if key not in right), False): - raise KeyError(next(key for key in right_on - if key not in right)) - - # There's a small chance that our partitions are already perfect, but - # if it's not, we need to adjust them. We adjust the right against the - # left because the defaults of merge rely on the order of the left. We - # have to push the index down here, so if we're joining on the right's - # index we go ahead and push it down here too. - if not np.array_equal(self._row_metadata._lengths, - right._row_metadata._lengths) or right_index: - - repartitioned_right = np.array([_match_partitioning._submit( - args=(df, self._row_metadata._lengths, right.index), - num_return_vals=len(self._row_metadata._lengths)) - for df in right._col_partitions]).T - else: - repartitioned_right = right._block_partitions - - if not left_index and not right_index: - # Passing None to each call specifies that we don't care about the - # left's index for the join. - left_idx = itertools.repeat(None) - - # We only return the index if we need to update it, and that only - # happens when either left_index or right_index is True. We will - # use this value to add the return vals if we are getting an index - # back. - return_index = False - else: - # We build this to push the index down so that we can use it for - # the join. - left_idx = \ - (v.index for k, v in - self._row_metadata._coord_df.copy().groupby('partition')) - return_index = True - - new_blocks = \ - np.array([_co_op_helper._submit( - args=tuple([lambda x, y: x.merge(y, *args), - left_cols, right_cols, - len(self._block_partitions.T), next(left_idx)] + - np.concatenate(obj).tolist()), - num_return_vals=len(self._block_partitions.T) + return_index) - for obj in zip(self._block_partitions, - repartitioned_right)]) - - if not return_index: - # Default to RangeIndex if left_index and right_index both false. - new_index = None - else: - new_index_parts = new_blocks[:, -1] - new_index = _concat_index.remote(*new_index_parts) - new_blocks = new_blocks[:, :-1] - - return DataFrame(block_partitions=new_blocks, - columns=new_columns, - index=new_index) - - def min(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - """Perform min across the DataFrame. - - Args: - axis (int): The axis to take the min on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The min of the DataFrame. - """ - def remote_func(df): - return df.min(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def mod(self, other, axis='columns', level=None, fill_value=None): - """Mods this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the mod against this. - axis: The axis to mod over. - level: The Multilevel index level to apply mod over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Mod applied. - """ - return self._operator_helper(pandas.DataFrame.mod, other, axis, level, - fill_value) - - def mode(self, axis=0, numeric_only=False): - """Perform mode across the DataFrame. - - Args: - axis (int): The axis to take the mode on. - numeric_only (bool): if True, only apply to numeric columns. - - Returns: - DataFrame: The mode of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) - - def mode_helper(df): - mode_df = df.mode(axis=axis, numeric_only=numeric_only) - return mode_df, mode_df.shape[axis] - - def fix_length(df, *lengths): - max_len = max(lengths[0]) - df = df.reindex(pandas.RangeIndex(max_len), axis=axis) - return df - - parts = self._col_partitions if axis == 0 else self._row_partitions - - result = [_deploy_func._submit(args=(lambda df: mode_helper(df), - part), num_return_vals=2) - for part in parts] - - parts, lengths = [list(t) for t in zip(*result)] - - parts = [_deploy_func.remote( - lambda df, *l: fix_length(df, l), part, *lengths) - for part in parts] - - if axis == 0: - return DataFrame(col_partitions=parts, - columns=self.columns) - else: - return DataFrame(row_partitions=parts, - index=self.index) - - def mul(self, other, axis='columns', level=None, fill_value=None): - """Multiplies this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the multiply against this. - axis: The axis to multiply over. - level: The Multilevel index level to apply multiply over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Multiply applied. - """ - return self._operator_helper(pandas.DataFrame.mul, other, axis, level, - fill_value) - - def multiply(self, other, axis='columns', level=None, fill_value=None): - """Synonym for mul. - - Args: - other: The object to use to apply the multiply against this. - axis: The axis to multiply over. - level: The Multilevel index level to apply multiply over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Multiply applied. - """ - return self.mul(other, axis, level, fill_value) - - def ne(self, other, axis='columns', level=None): - """Checks element-wise that this is not equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the ne over. - level: The Multilevel index level to apply ne over. - - Returns: - A new DataFrame filled with Booleans. - """ - return self._operator_helper(pandas.DataFrame.ne, other, axis, level) - - def nlargest(self, n, columns, keep='first'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def notna(self): - """Perform notna across the DataFrame. - - Args: - None - - Returns: - Boolean DataFrame where value is False if corresponding - value is NaN, True otherwise - """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.notna(), block) for block in self._block_partitions]) - - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) - - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) - - def notnull(self): - """Perform notnull across the DataFrame. - - Args: - None - - Returns: - Boolean DataFrame where value is False if corresponding - value is NaN, True otherwise - """ - new_block_partitions = np.array([_map_partitions( - lambda df: df.notnull(), block) - for block in self._block_partitions]) - - new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns), - index=self.columns) - - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata, - dtypes_cache=new_dtypes) - - def nsmallest(self, n, columns, keep='first'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def nunique(self, axis=0, dropna=True): - """Return Series with number of distinct - observations over requested axis. - - Args: - axis : {0 or 'index', 1 or 'columns'}, default 0 - dropna : boolean, default True - - Returns: - nunique : Series - """ - def remote_func(df): - return df.nunique(axis=axis, dropna=dropna) - - return self._arithmetic_helper(remote_func, axis) - - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, - **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def pipe(self, func, *args, **kwargs): - """Apply func(self, *args, **kwargs) - - Args: - func: function to apply to the df. - args: positional arguments passed into ``func``. - kwargs: a dictionary of keyword arguments passed into ``func``. - - Returns: - object: the return type of ``func``. - """ - return com._pipe(self, func, *args, **kwargs) - - def pivot(self, index=None, columns=None, values=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def pivot_table(self, values=None, index=None, columns=None, - aggfunc='mean', fill_value=None, margins=False, - dropna=True, margins_name='All'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def plot(self, x=None, y=None, kind='line', ax=None, subplots=False, - sharex=None, sharey=False, layout=None, figsize=None, - use_index=True, title=None, grid=None, legend=True, style=None, - logx=False, logy=False, loglog=False, xticks=None, yticks=None, - xlim=None, ylim=None, rot=None, fontsize=None, colormap=None, - table=False, yerr=None, xerr=None, secondary_y=False, - sort_columns=False, **kwds): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def pop(self, item): - """Pops an item from this DataFrame and returns it. - - Args: - item (str): Column label to be popped - - Returns: - A Series containing the popped values. Also modifies this - DataFrame. - """ - result = self[item] - del self[item] - return result - - def pow(self, other, axis='columns', level=None, fill_value=None): - """Pow this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the pow against this. - axis: The axis to pow over. - level: The Multilevel index level to apply pow over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Pow applied. - """ - return self._operator_helper(pandas.DataFrame.pow, other, axis, level, - fill_value) - - def prod(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=1, **kwargs): - """Return the product of the values for the requested axis - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - level : int or level name, default None - numeric_only : boolean, default None - min_count : int, default 1 - - Returns: - prod : Series or DataFrame (if level specified) - """ - def remote_func(df): - return df.prod(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, min_count=min_count, - **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def product(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=1, **kwargs): - """Return the product of the values for the requested axis - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - level : int or level name, default None - numeric_only : boolean, default None - min_count : int, default 1 - - Returns: - product : Series or DataFrame (if level specified) - """ - return self.prod(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, min_count=min_count, - **kwargs) - - def quantile(self, q=0.5, axis=0, numeric_only=True, - interpolation='linear'): - """Return values at the given quantile over requested axis, - a la numpy.percentile. - - Args: - q (float): 0 <= q <= 1, the quantile(s) to compute - axis (int): 0 or 'index' for row-wise, - 1 or 'columns' for column-wise - interpolation: {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - Specifies which interpolation method to use - - Returns: - quantiles : Series or DataFrame - If q is an array, a DataFrame will be returned where the - index is q, the columns are the columns of self, and the - values are the quantiles. - - If q is a float, a Series will be returned where the - index is the columns of self and the values - are the quantiles. - """ - - def check_bad_dtype(t): - return t == np.dtype('O') or is_timedelta64_dtype(t) - - if not numeric_only: - # check if there are any object columns - if all(check_bad_dtype(t) for t in self.dtypes): - raise TypeError("can't multiply sequence by non-int of type " - "'float'") - else: - if next((True for t in self.dtypes if check_bad_dtype(t)), - False): - dtype = next(t for t in self.dtypes if check_bad_dtype(t)) - raise ValueError("Cannot compare type '{}' with type '{}'" - .format(type(dtype), float)) - else: - # Normally pandas returns this near the end of the quantile, but we - # can't afford the overhead of running the entire operation before - # we error. - if all(check_bad_dtype(t) for t in self.dtypes): - raise ValueError("need at least one array to concatenate") - - # check that all qs are between 0 and 1 - pandas.DataFrame()._check_percentile(q) - - def quantile_helper(df, base_object): - """Quantile to be run inside each partitoin. - - Args: - df: The DataFrame composing the partition. - base_object: An empty pandas.Series or pandas.DataFrame - depending on q. - - Returns: - A new Series or DataFrame depending on q. - """ - # This if call prevents ValueErrors with object only partitions - if (numeric_only and - all(dtype == np.dtype('O') or - is_timedelta64_dtype(dtype) - for dtype in df.dtypes)): - return base_object - else: - return df.quantile(q=q, axis=axis, numeric_only=numeric_only, - interpolation=interpolation) - - axis = pandas.DataFrame()._get_axis_number(axis) - - if isinstance(q, (pandas.Series, np.ndarray, pandas.Index, list)): - - q_index = pandas.Float64Index(q) - - if axis == 0: - new_partitions = _map_partitions( - lambda df: quantile_helper(df, pandas.DataFrame()), - self._col_partitions) - - # select only correct dtype columns - new_columns = self.dtypes[self.dtypes.apply( - lambda x: is_numeric_dtype(x))].index - - else: - new_partitions = _map_partitions( - lambda df: quantile_helper(df, pandas.DataFrame()), - self._row_partitions) - new_columns = self.index - - return DataFrame(col_partitions=new_partitions, - index=q_index, - columns=new_columns) - - else: - # When q is a single float, we return a Series, so using - # arithmetic_helper works well here. - result = self._arithmetic_helper( - lambda df: quantile_helper(df, pandas.Series()), axis) - result.name = q - return result - - def query(self, expr, inplace=False, **kwargs): - """Queries the Dataframe with a boolean expression - - Returns: - A new DataFrame if inplace=False - """ - self._validate_eval_query(expr, **kwargs) - - columns = self.columns - - def query_helper(df): - df = df.copy() - df.columns = columns - df.query(expr, inplace=True, **kwargs) - df.columns = pandas.RangeIndex(0, len(df.columns)) - return df - - new_rows = _map_partitions(query_helper, - self._row_partitions) - - if inplace: - self._update_inplace(row_partitions=new_rows, index=self.index) - else: - return DataFrame(row_partitions=new_rows, - col_metadata=self._col_metadata) - - def radd(self, other, axis='columns', level=None, fill_value=None): - return self.add(other, axis, level, fill_value) - - def rank(self, axis=0, method='average', numeric_only=None, - na_option='keep', ascending=True, pct=False): - - """ - Compute numerical data ranks (1 through n) along axis. - Equal values are assigned a rank that is the [method] of - the ranks of those values. - - Args: - axis (int): 0 or 'index' for row-wise, - 1 or 'columns' for column-wise - interpolation: {'average', 'min', 'max', 'first', 'dense'} - Specifies which method to use for equal vals - numeric_only (boolean) - Include only float, int, boolean data. - na_option: {'keep', 'top', 'bottom'} - Specifies how to handle NA options - ascending (boolean): - Decedes ranking order - pct (boolean): - Computes percentage ranking of data - Returns: - A new DataFrame - """ - - def rank_helper(df): - return df.rank(axis=axis, method=method, - numeric_only=numeric_only, - na_option=na_option, - ascending=ascending, pct=pct) - - axis = pandas.DataFrame()._get_axis_number(axis) - - if (axis == 1): - new_cols = self.dtypes[self.dtypes.apply( - lambda x: is_numeric_dtype(x))].index - result = _map_partitions(rank_helper, - self._row_partitions) - return DataFrame(row_partitions=result, - columns=new_cols, - index=self.index) - - if (axis == 0): - result = _map_partitions(rank_helper, - self._col_partitions) - return DataFrame(col_partitions=result, - columns=self.columns, - index=self.index) - - def rdiv(self, other, axis='columns', level=None, fill_value=None): - return self._single_df_op_helper( - lambda df: df.rdiv(other, axis, level, fill_value), - other, axis, level) - - def reindex(self, labels=None, index=None, columns=None, axis=None, - method=None, copy=True, level=None, fill_value=np.nan, - limit=None, tolerance=None): - if level is not None: - raise NotImplementedError( - "Multilevel Index not Implemented. " - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 - if axis == 0 and labels is not None: - index = labels - elif labels is not None: - columns = labels - - new_blocks = self._block_partitions - if index is not None: - old_index = self.index - new_blocks = np.array([reindex_helper._submit( - args=(old_index, index, 1, len(new_blocks), method, - fill_value, limit, tolerance) + tuple(block.tolist()), - num_return_vals=len(new_blocks)) - for block in new_blocks.T]).T - else: - index = self.index - - if columns is not None: - old_columns = self.columns - new_blocks = np.array([reindex_helper._submit( - args=(old_columns, columns, 0, new_blocks.shape[1], method, - fill_value, limit, tolerance) + tuple(block.tolist()), - num_return_vals=new_blocks.shape[1]) - for block in new_blocks]) - else: - columns = self.columns - - if copy: - return DataFrame(block_partitions=new_blocks, - index=index, - columns=columns) - - self._update_inplace(block_partitions=new_blocks, - index=index, - columns=columns) - - def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def reindex_like(self, other, method=None, copy=True, limit=None, - tolerance=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def rename(self, mapper=None, index=None, columns=None, axis=None, - copy=True, inplace=False, level=None): - """Alters axes labels. - - Args: - mapper, index, columns: Transformations to apply to the axis's - values. - axis: Axis to target with mapper. - copy: Also copy underlying data. - inplace: Whether to return a new DataFrame. - level: Only rename a specific level of a MultiIndex. - - Returns: - If inplace is False, a new DataFrame with the updated axes. - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - - # We have to do this with the args because of how rename handles - # kwargs. It doesn't ignore None values passed in, so we have to filter - # them ourselves. - args = locals() - kwargs = {k: v for k, v in args.items() - if v is not None and k != "self"} - # inplace should always be true because this is just a copy, and we - # will use the results after. - kwargs['inplace'] = True - - df_to_rename = pandas.DataFrame(index=self.index, columns=self.columns) - df_to_rename.rename(**kwargs) - - if inplace: - obj = self - else: - obj = self.copy() - - obj.index = df_to_rename.index - obj.columns = df_to_rename.columns - - if not inplace: - return obj - - def rename_axis(self, mapper, axis=0, copy=True, inplace=False): - axes_is_columns = axis == 1 or axis == "columns" - renamed = self if inplace else self.copy() - if axes_is_columns: - renamed.columns.name = mapper - else: - renamed.index.name = mapper - if not inplace: - return renamed - - def _set_axis_name(self, name, axis=0, inplace=False): - """Alter the name or names of the axis. - - Args: - name: Name for the Index, or list of names for the MultiIndex - axis: 0 or 'index' for the index; 1 or 'columns' for the columns - inplace: Whether to modify `self` directly or return a copy - - Returns: - Type of caller or None if inplace=True. - """ - axes_is_columns = axis == 1 or axis == "columns" - renamed = self if inplace else self.copy() - if axes_is_columns: - renamed.columns.set_names(name) - else: - renamed.index.set_names(name) - - if not inplace: - return renamed - - def reorder_levels(self, order, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad', axis=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, - label=None, convention='start', kind=None, loffset=None, - limit=None, base=0, on=None, level=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def reset_index(self, level=None, drop=False, inplace=False, col_level=0, - col_fill=''): - """Reset this index to default and create column from current index. - - Args: - level: Only remove the given levels from the index. Removes all - levels by default - drop: Do not try to insert index into dataframe columns. This - resets the index to the default integer index. - inplace: Modify the DataFrame in place (do not create a new object) - col_level : If the columns have multiple levels, determines which - level the labels are inserted into. By default it is inserted - into the first level. - col_fill: If the columns have multiple levels, determines how the - other levels are named. If None then the index name is - repeated. - - Returns: - A new DataFrame if inplace is False, None otherwise. - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if inplace: - new_obj = self - else: - new_obj = self.copy() - - def _maybe_casted_values(index, labels=None): - if isinstance(index, pandas.PeriodIndex): - values = index.asobject.values - elif isinstance(index, pandas.DatetimeIndex) \ - and index.tz is not None: - values = index - else: - values = index.values - if values.dtype == np.object_: - values = lib.maybe_convert_objects(values) - - # if we have the labels, extract the values with a mask - if labels is not None: - mask = labels == -1 - - # we can have situations where the whole mask is -1, - # meaning there is nothing found in labels, so make all nan's - if mask.all(): - values = np.empty(len(mask)) - values.fill(np.nan) - else: - values = values.take(labels) - if mask.any(): - values, changed = maybe_upcast_putmask( - values, mask, np.nan) - return values - - # We're building a new default index dataframe for use later. - new_index = pandas.RangeIndex(len(self)) - if level is not None: - if not isinstance(level, (tuple, list)): - level = [level] - level = [self.index._get_level_number(lev) for lev in level] - if isinstance(self.index, pandas.MultiIndex): - if len(level) < self.index.nlevels: - new_index = self.index.droplevel(level) - - if not drop: - if isinstance(self.index, pandas.MultiIndex): - names = [n if n is not None else ('level_%d' % i) - for (i, n) in enumerate(self.index.names)] - to_insert = lzip(self.index.levels, self.index.labels) - else: - default = 'index' - i = 0 - while default in self: - default = 'level_{}'.format(i) - i += 1 - - names = ([default] if self.index.name is None - else [self.index.name]) - to_insert = ((self.index, None),) - - multi_col = isinstance(self.columns, pandas.MultiIndex) - for i, (lev, lab) in reversed(list(enumerate(to_insert))): - if not (level is None or i in level): - continue - name = names[i] - if multi_col: - col_name = (list(name) if isinstance(name, tuple) - else [name]) - if col_fill is None: - if len(col_name) not in (1, self.columns.nlevels): - raise ValueError("col_fill=None is incompatible " - "with incomplete column name " - "{}".format(name)) - col_fill = col_name[0] - - lev_num = self.columns._get_level_number(col_level) - name_lst = [col_fill] * lev_num + col_name - missing = self.columns.nlevels - len(name_lst) - name_lst += [col_fill] * missing - name = tuple(name_lst) - # to ndarray and maybe infer different dtype - level_values = _maybe_casted_values(lev, lab) - new_obj.insert(0, name, level_values) - - new_obj.index = new_index - - if not inplace: - return new_obj - - def rfloordiv(self, other, axis='columns', level=None, fill_value=None): - return self._single_df_op_helper( - lambda df: df.rfloordiv(other, axis, level, fill_value), - other, axis, level) - - def rmod(self, other, axis='columns', level=None, fill_value=None): - return self._single_df_op_helper( - lambda df: df.rmod(other, axis, level, fill_value), - other, axis, level) - - def rmul(self, other, axis='columns', level=None, fill_value=None): - return self.mul(other, axis, level, fill_value) - - def rolling(self, window, min_periods=None, freq=None, center=False, - win_type=None, on=None, axis=0, closed=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def round(self, decimals=0, *args, **kwargs): - new_block_partitions = np.array([_map_partitions( - lambda df: df.round(decimals=decimals, *args, **kwargs), block) - for block in self._block_partitions]) - - return DataFrame(block_partitions=new_block_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) - - def rpow(self, other, axis='columns', level=None, fill_value=None): - return self._single_df_op_helper( - lambda df: df.rpow(other, axis, level, fill_value), - other, axis, level) - - def rsub(self, other, axis='columns', level=None, fill_value=None): - return self._single_df_op_helper( - lambda df: df.rsub(other, axis, level, fill_value), - other, axis, level) - - def rtruediv(self, other, axis='columns', level=None, fill_value=None): - return self._single_df_op_helper( - lambda df: df.rtruediv(other, axis, level, fill_value), - other, axis, level) - - def sample(self, n=None, frac=None, replace=False, weights=None, - random_state=None, axis=None): - """Returns a random sample of items from an axis of object. - - Args: - n: Number of items from axis to return. Cannot be used with frac. - Default = 1 if frac = None. - frac: Fraction of axis items to return. Cannot be used with n. - replace: Sample with or without replacement. Default = False. - weights: Default 'None' results in equal probability weighting. - If passed a Series, will align with target object on index. - Index values in weights not found in sampled object will be - ignored and index values in sampled object not in weights will - be assigned weights of zero. If called on a DataFrame, will - accept the name of a column when axis = 0. Unless weights are - a Series, weights must be same length as axis being sampled. - If weights do not sum to 1, they will be normalized to sum - to 1. Missing values in the weights column will be treated as - zero. inf and -inf values not allowed. - random_state: Seed for the random number generator (if int), or - numpy RandomState object. - axis: Axis to sample. Accepts axis number or name. - - Returns: - A new Dataframe - """ - - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 - - if axis == 0: - axis_length = len(self._row_metadata) - else: - axis_length = len(self._col_metadata) - - if weights is not None: - - # Index of the weights Series should correspond to the index of the - # Dataframe in order to sample - if isinstance(weights, pandas.Series): - weights = weights.reindex(self.axes[axis]) - - # If weights arg is a string, the weights used for sampling will - # the be values in the column corresponding to that string - if isinstance(weights, string_types): - if axis == 0: - try: - weights = self[weights] - except KeyError: - raise KeyError("String passed to weights not a " - "valid column") - else: - raise ValueError("Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame") - - weights = pandas.Series(weights, dtype='float64') - - if len(weights) != axis_length: - raise ValueError("Weights and axis to be sampled must be of " - "same length") - - if (weights == np.inf).any() or (weights == -np.inf).any(): - raise ValueError("weight vector may not include `inf` values") - - if (weights < 0).any(): - raise ValueError("weight vector many not include negative " - "values") - - # weights cannot be NaN when sampling, so we must set all nan - # values to 0 - weights = weights.fillna(0) - - # If passed in weights are not equal to 1, renormalize them - # otherwise numpy sampling function will error - weights_sum = weights.sum() - if weights_sum != 1: - if weights_sum != 0: - weights = weights / weights_sum - else: - raise ValueError("Invalid weights: weights sum to zero") - - weights = weights.values - - if n is None and frac is None: - # default to n = 1 if n and frac are both None (in accordance with - # Pandas specification) - n = 1 - elif n is not None and frac is None and n % 1 != 0: - # n must be an integer - raise ValueError("Only integers accepted as `n` values") - elif n is None and frac is not None: - # compute the number of samples based on frac - n = int(round(frac * axis_length)) - elif n is not None and frac is not None: - # Pandas specification does not allow both n and frac to be passed - # in - raise ValueError('Please enter a value for `frac` OR `n`, not ' - 'both') - if n < 0: - raise ValueError("A negative number of rows requested. Please " - "provide positive value.") - - if n == 0: - # An Empty DataFrame is returned if the number of samples is 0. - # The Empty Dataframe should have either columns or index specified - # depending on which axis is passed in. - return DataFrame(columns=[] if axis == 1 else self.columns, - index=self.index if axis == 1 else []) - - if axis == 1: - axis_labels = self.columns - partition_metadata = self._col_metadata - partitions = self._col_partitions - else: - axis_labels = self.index - partition_metadata = self._row_metadata - partitions = self._row_partitions - - if random_state is not None: - # Get a random number generator depending on the type of - # random_state that is passed in - if isinstance(random_state, int): - random_num_gen = np.random.RandomState(random_state) - elif isinstance(random_state, np.random.randomState): - random_num_gen = random_state - else: - # random_state must be an int or a numpy RandomState object - raise ValueError("Please enter an `int` OR a " - "np.random.RandomState for random_state") - - # choose random numbers and then get corresponding labels from - # chosen axis - sample_indices = random_num_gen.randint( - low=0, - high=len(partition_metadata), - size=n) - samples = axis_labels[sample_indices] - else: - # randomly select labels from chosen axis - samples = np.random.choice(a=axis_labels, size=n, - replace=replace, p=weights) - - # create an array of (partition, index_within_partition) tuples for - # each sample - part_ind_tuples = [partition_metadata[sample] - for sample in samples] - - if axis == 1: - # tup[0] refers to the partition number and tup[1] is the index - # within that partition - new_cols = [_deploy_func.remote(lambda df: df.iloc[:, [tup[1]]], - partitions[tup[0]]) for tup in part_ind_tuples] - return DataFrame(col_partitions=new_cols, - columns=samples, - index=self.index) - else: - new_rows = [_deploy_func.remote(lambda df: df.loc[[tup[1]]], - partitions[tup[0]]) for tup in part_ind_tuples] - return DataFrame(row_partitions=new_rows, - columns=self.columns, - index=samples) - - def select(self, crit, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def select_dtypes(self, include=None, exclude=None): - # Validates arguments for whether both include and exclude are None or - # if they are disjoint. Also invalidates string dtypes. - pandas.DataFrame().select_dtypes(include, exclude) - - if include and not is_list_like(include): - include = [include] - elif not include: - include = [] - - if exclude and not is_list_like(exclude): - exclude = [exclude] - elif not exclude: - exclude = [] - - sel = tuple(map(set, (include, exclude))) - - include, exclude = map( - lambda x: set(map(_get_dtype_from_object, x)), sel) - - include_these = pandas.Series(not bool(include), index=self.columns) - exclude_these = pandas.Series(not bool(exclude), index=self.columns) - - def is_dtype_instance_mapper(column, dtype): - return column, functools.partial(issubclass, dtype.type) - - for column, f in itertools.starmap(is_dtype_instance_mapper, - self.dtypes.iteritems()): - if include: # checks for the case of empty include or exclude - include_these[column] = any(map(f, include)) - if exclude: - exclude_these[column] = not any(map(f, exclude)) - - dtype_indexer = include_these & exclude_these - indicate = [i for i in range(len(dtype_indexer.values)) - if not dtype_indexer.values[i]] - return self.drop(columns=self.columns[indicate], inplace=False) - - def sem(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def set_axis(self, labels, axis=0, inplace=None): - """Assign desired index to given axis. - - Args: - labels (pandas.Index or list-like): The Index to assign. - axis (string or int): The axis to reassign. - inplace (bool): Whether to make these modifications inplace. - - Returns: - If inplace is False, returns a new DataFrame, otherwise None. - """ - if is_scalar(labels): - warnings.warn( - 'set_axis now takes "labels" as first argument, and ' - '"axis" as named parameter. The old form, with "axis" as ' - 'first parameter and \"labels\" as second, is still supported ' - 'but will be deprecated in a future version of pandas.', - FutureWarning, stacklevel=2) - labels, axis = axis, labels - - if inplace is None: - warnings.warn( - 'set_axis currently defaults to operating inplace.\nThis ' - 'will change in a future version of pandas, use ' - 'inplace=True to avoid this warning.', - FutureWarning, stacklevel=2) - inplace = True - if inplace: - setattr(self, pandas.DataFrame()._get_axis_name(axis), labels) - else: - obj = self.copy() - obj.set_axis(labels, axis=axis, inplace=True) - return obj - - def set_index(self, keys, drop=True, append=False, inplace=False, - verify_integrity=False): - """Set the DataFrame index using one or more existing columns. - - Args: - keys: column label or list of column labels / arrays. - drop (boolean): Delete columns to be used as the new index. - append (boolean): Whether to append columns to existing index. - inplace (boolean): Modify the DataFrame in place. - verify_integrity (boolean): Check the new index for duplicates. - Otherwise defer the check until necessary. Setting to False - will improve the performance of this method - - Returns: - If inplace is set to false returns a new DataFrame, otherwise None. - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if not isinstance(keys, list): - keys = [keys] - - if inplace: - frame = self - else: - frame = self.copy() - - arrays = [] - names = [] - if append: - names = [x for x in self.index.names] - if isinstance(self.index, pandas.MultiIndex): - for i in range(self.index.nlevels): - arrays.append(self.index._get_level_values(i)) - else: - arrays.append(self.index) - - to_remove = [] - for col in keys: - if isinstance(col, pandas.MultiIndex): - # append all but the last column so we don't have to modify - # the end of this loop - for n in range(col.nlevels - 1): - arrays.append(col._get_level_values(n)) - - level = col._get_level_values(col.nlevels - 1) - names.extend(col.names) - elif isinstance(col, pandas.Series): - level = col._values - names.append(col.name) - elif isinstance(col, pandas.Index): - level = col - names.append(col.name) - elif isinstance(col, (list, np.ndarray, pandas.Index)): - level = col - names.append(None) - else: - level = frame[col]._values - names.append(col) - if drop: - to_remove.append(col) - arrays.append(level) - - index = _ensure_index_from_sequences(arrays, names) - - if verify_integrity and not index.is_unique: - duplicates = index.get_duplicates() - raise ValueError('Index has duplicate keys: %s' % duplicates) - - for c in to_remove: - del frame[c] - - # clear up memory usage - index._cleanup() - - frame.index = index - - if not inplace: - return frame - - def set_value(self, index, col, value, takeable=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def shift(self, periods=1, freq=None, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - """Return unbiased skew over requested axis Normalized by N-1 - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - numeric_only : boolean, default None - - Returns: - skew : Series or DataFrame (if level specified) - """ - def remote_func(df): - return df.skew(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def slice_shift(self, periods=1, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True, - by=None): - """Sort a DataFrame by one of the indices (columns or index). - - Args: - axis: The axis to sort over. - level: The MultiIndex level to sort over. - ascending: Ascending or descending - inplace: Whether or not to update this DataFrame inplace. - kind: How to perform the sort. - na_position: Where to position NA on the sort. - sort_remaining: On Multilevel Index sort based on all levels. - by: (Deprecated) argument to pass to sort_values. - - Returns: - A sorted DataFrame - """ - if level is not None: - raise NotImplementedError("Multilevel index not yet implemented.") - - if by is not None: - warnings.warn("by argument to sort_index is deprecated, " - "please use .sort_values(by=...)", - FutureWarning, stacklevel=2) - if level is not None: - raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, - inplace=inplace) - - axis = pandas.DataFrame()._get_axis_number(axis) - - args = (axis, level, ascending, False, kind, na_position, - sort_remaining) - - def _sort_helper(df, index, axis, *args): - if axis == 0: - df.index = index - else: - df.columns = index - - result = df.sort_index(*args) - df.reset_index(drop=True, inplace=True) - df.columns = pandas.RangeIndex(len(df.columns)) - return result - - if axis == 0: - index = self.index - new_column_parts = _map_partitions( - lambda df: _sort_helper(df, index, axis, *args), - self._col_partitions) - - new_columns = self.columns - new_index = self.index.sort_values(ascending=ascending) - new_row_parts = None - else: - columns = self.columns - new_row_parts = _map_partitions( - lambda df: _sort_helper(df, columns, axis, *args), - self._row_partitions) - - new_columns = self.columns.sort_values(ascending=ascending) - new_index = self.index - new_column_parts = None - - if not inplace: - return DataFrame(col_partitions=new_column_parts, - row_partitions=new_row_parts, - index=new_index, - columns=new_columns) - else: - self._update_inplace(row_partitions=new_row_parts, - col_partitions=new_column_parts, - columns=new_columns, - index=new_index) - - def sort_values(self, by, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): - """Sorts by a column/row or list of columns/rows. - - Args: - by: A list of labels for the axis to sort over. - axis: The axis to sort. - ascending: Sort in ascending or descending order. - inplace: If true, do the operation inplace. - kind: How to sort. - na_position: Where to put np.nan values. - - Returns: - A sorted DataFrame. - """ - - axis = pandas.DataFrame()._get_axis_number(axis) - - if not is_list_like(by): - by = [by] - - if axis == 0: - broadcast_value_dict = {str(col): self[col] for col in by} - broadcast_values = pandas.DataFrame(broadcast_value_dict) - else: - broadcast_value_list = [to_pandas(self[row::len(self.index)]) - for row in by] - - index_builder = list(zip(broadcast_value_list, by)) - - for row, idx in index_builder: - row.index = [str(idx)] - - broadcast_values = \ - pandas.concat([row for row, idx in index_builder], copy=False) - - # We are converting the by to string here so that we don't have a - # collision with the RangeIndex on the inner frame. It is cheap and - # gaurantees that we sort by the correct column. - by = [str(col) for col in by] - - args = (by, axis, ascending, False, kind, na_position) - - def _sort_helper(df, broadcast_values, axis, *args): - """Sorts the data on a partition. - - Args: - df: The DataFrame to sort. - broadcast_values: The by DataFrame to use for the sort. - axis: The axis to sort over. - args: The args for the sort. - - Returns: - A new sorted DataFrame. - """ - if axis == 0: - broadcast_values.index = df.index - names = broadcast_values.columns - else: - broadcast_values.columns = df.columns - names = broadcast_values.index - - return pandas.concat([df, broadcast_values], axis=axis ^ 1, - copy=False).sort_values(*args) \ - .drop(names, axis=axis ^ 1) - - if axis == 0: - new_column_partitions = _map_partitions( - lambda df: _sort_helper(df, broadcast_values, axis, *args), - self._col_partitions) - - new_row_partitions = None - new_columns = self.columns - - # This is important because it allows us to get the axis that we - # aren't sorting over. We need the order of the columns/rows and - # this will provide that in the return value. - new_index = broadcast_values.sort_values(*args).index - else: - new_row_partitions = _map_partitions( - lambda df: _sort_helper(df, broadcast_values, axis, *args), - self._row_partitions) - - new_column_partitions = None - new_columns = broadcast_values.sort_values(*args).columns - new_index = self.index - - if inplace: - self._update_inplace(row_partitions=new_row_partitions, - col_partitions=new_column_partitions, - columns=new_columns, - index=new_index) - else: - return DataFrame(row_partitions=new_row_partitions, - col_partitions=new_column_partitions, - columns=new_columns, - index=new_index, - dtypes_cache=self._dtypes_cache) - - def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, - sort_remaining=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def squeeze(self, axis=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def stack(self, level=-1, dropna=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def std(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): - """Computes standard deviation across the DataFrame. - - Args: - axis (int): The axis to take the std on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The std of the DataFrame (Pandas Series) - """ - def remote_func(df): - return df.std(axis=axis, skipna=skipna, level=level, ddof=ddof, - numeric_only=numeric_only, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def sub(self, other, axis='columns', level=None, fill_value=None): - """Subtract a DataFrame/Series/scalar from this DataFrame. - - Args: - other: The object to use to apply the subtraction to this. - axis: THe axis to apply the subtraction over. - level: Mutlilevel index level to subtract over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the subtraciont applied. - """ - return self._operator_helper(pandas.DataFrame.sub, other, axis, level, - fill_value) - - def subtract(self, other, axis='columns', level=None, fill_value=None): - """Alias for sub. - - Args: - other: The object to use to apply the subtraction to this. - axis: THe axis to apply the subtraction over. - level: Mutlilevel index level to subtract over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the subtraciont applied. - """ - return self.sub(other, axis, level, fill_value) - - def swapaxes(self, axis1, axis2, copy=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def swaplevel(self, i=-2, j=-1, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def tail(self, n=5): - """Get the last n rows of the dataframe. - - Args: - n (int): The number of rows to return. - - Returns: - A new dataframe with the last n rows of this dataframe. - """ - if n >= len(self._row_metadata): - return self - - new_dfs = _map_partitions(lambda df: df.tail(n), - self._col_partitions) - - index = self._row_metadata.index[-n:] - return DataFrame(col_partitions=new_dfs, - col_metadata=self._col_metadata, - index=index, - dtypes_cache=self._dtypes_cache) - - def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_clipboard(self, excel=None, sep=None, **kwargs): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_clipboard(excel, sep, **kwargs) - - def to_csv(self, path_or_buf=None, sep=",", na_rep="", float_format=None, - columns=None, header=True, index=True, index_label=None, - mode="w", encoding=None, compression=None, quoting=None, - quotechar='"', line_terminator="\n", chunksize=None, - tupleize_cols=None, date_format=None, doublequote=True, - escapechar=None, decimal="."): - - kwargs = { - 'path_or_buf': path_or_buf, - 'sep': sep, - 'na_rep': na_rep, - 'float_format': float_format, - 'columns': columns, - 'header': header, - 'index': index, - 'index_label': index_label, - 'mode': mode, - 'encoding': encoding, - 'compression': compression, - 'quoting': quoting, - 'quotechar': quotechar, - 'line_terminator': line_terminator, - 'chunksize': chunksize, - 'tupleize_cols': tupleize_cols, - 'date_format': date_format, - 'doublequote': doublequote, - 'escapechar': escapechar, - 'decimal': decimal - } - - if compression is not None: - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - return to_pandas(self).to_csv(**kwargs) - - if tupleize_cols is not None: - warnings.warn("The 'tupleize_cols' parameter is deprecated and " - "will be removed in a future version", - FutureWarning, stacklevel=2) - else: - tupleize_cols = False - - remote_kwargs_id = ray.put(dict(kwargs, path_or_buf=None)) - columns_id = ray.put(self.columns) - - def get_csv_str(df, index, columns, header, kwargs): - df.index = index - df.columns = columns - kwargs["header"] = header - return df.to_csv(**kwargs) - - idxs = [0] + np.cumsum(self._row_metadata._lengths).tolist() - idx_args = [self.index[idxs[i]:idxs[i + 1]] - for i in range(len(self._row_partitions))] - csv_str_ids = _map_partitions( - get_csv_str, self._row_partitions, idx_args, - [columns_id] * len(self._row_partitions), - [header] + [False] * (len(self._row_partitions) - 1), - [remote_kwargs_id] * len(self._row_partitions)) - - if path_or_buf is None: - buf = io.StringIO() - elif isinstance(path_or_buf, str): - buf = open(path_or_buf, mode) - else: - buf = path_or_buf - - for csv_str_id in csv_str_ids: - buf.write(ray.get(csv_str_id)) - buf.flush() - - result = None - if path_or_buf is None: - result = buf.getvalue() - buf.close() - elif isinstance(path_or_buf, str): - buf.close() - return result - - def to_dense(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_dict(self, orient='dict', into=dict): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', verbose=True, - freeze_panes=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_excel(excel_writer, sheet_name, na_rep, - float_format, columns, header, index, - index_label, startrow, startcol, engine, - merge_cells, encoding, inf_rep, verbose, - freeze_panes) - - def to_feather(self, fname): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_feather(fname) - - def to_gbq(self, destination_table, project_id, chunksize=10000, - verbose=True, reauth=False, if_exists='fail', - private_key=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_hdf(self, path_or_buf, key, **kwargs): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_hdf(path_or_buf, key, **kwargs) - - def to_html(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='np.NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, bold_rows=True, classes=None, escape=True, - max_rows=None, max_cols=None, show_dimensions=False, - notebook=False, decimal='.', border=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_html(buf, columns, col_space, header, - index, na_rep, formatters, - float_format, sparsify, index_names, - justify, bold_rows, classes, escape, - max_rows, max_cols, show_dimensions, - notebook, decimal, border) - - def to_json(self, path_or_buf=None, orient=None, date_format=None, - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_json(path_or_buf, orient, date_format, - double_precision, force_ascii, date_unit, - default_handler, lines, compression) - - def to_latex(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='np.NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - bold_rows=False, column_format=None, longtable=None, - escape=None, encoding=None, decimal='.', multicolumn=None, - multicolumn_format=None, multirow=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_msgpack(path_or_buf, encoding, **kwargs) - - def to_panel(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_parquet(self, fname, engine='auto', compression='snappy', - **kwargs): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_parquet(fname, engine, compression, **kwargs) - - def to_period(self, freq=None, axis=0, copy=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_pickle(self, path, compression='infer', - protocol=pkl.HIGHEST_PROTOCOL): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_pickle(path, compression, protocol) - - def to_records(self, index=True, convert_datetime64=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_sparse(self, fill_value=None, kind='block'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', - index=True, index_label=None, chunksize=None, dtype=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_sql(name, con, flavor, schema, if_exists, - index, index_label, chunksize, dtype) - - def to_stata(self, fname, convert_dates=None, write_index=True, - encoding='latin-1', byteorder=None, time_stamp=None, - data_label=None, variable_labels=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = to_pandas(self) - port_frame.to_stata(fname, convert_dates, write_index, - encoding, byteorder, time_stamp, - data_label, variable_labels) - - def to_string(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='np.NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, line_width=None, max_rows=None, max_cols=None, - show_dimensions=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_timestamp(self, freq=None, how='start', axis=0, copy=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def to_xarray(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def transform(self, func, *args, **kwargs): - kwargs["is_transform"] = True - result = self.agg(func, *args, **kwargs) - try: - result.columns = self.columns - result.index = self.index - except ValueError: - raise ValueError("transforms cannot produce aggregated results") - return result - - def truediv(self, other, axis='columns', level=None, fill_value=None): - """Divides this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the divide against this. - axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. - """ - return self._operator_helper(pandas.DataFrame.truediv, other, axis, - level, fill_value) - - def truncate(self, before=None, after=None, axis=None, copy=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def tshift(self, periods=1, freq=None, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def tz_convert(self, tz, axis=0, level=None, copy=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def tz_localize(self, tz, axis=0, level=None, copy=True, - ambiguous='raise'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def unstack(self, level=-1, fill_value=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): - """Modify DataFrame in place using non-NA values from other. - - Args: - other: DataFrame, or object coercible into a DataFrame - join: {'left'}, default 'left' - overwrite: If True then overwrite values for common keys in frame - filter_func: Can choose to replace values other than NA. - raise_conflict: If True, will raise an error if the DataFrame and - other both contain data in the same place. - - Returns: - None - """ - if raise_conflict: - raise NotImplementedError( - "raise_conflict parameter not yet supported. " - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - if not isinstance(other, DataFrame): - other = DataFrame(other) - - def update_helper(x, y): - x.update(y, join, overwrite, filter_func, False) - return x - - self._inter_df_op_helper(update_helper, other, join, 0, None, - inplace=True) - - def var(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): - """Computes variance across the DataFrame. - - Args: - axis (int): The axis to take the variance on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The variance of the DataFrame. - """ - def remote_func(df): - return df.var(axis=axis, skipna=skipna, level=level, ddof=ddof, - numeric_only=numeric_only, **kwargs) - - return self._arithmetic_helper(remote_func, axis, level) - - def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False, raise_on_error=None): - """Replaces values not meeting condition with values in other. - - Args: - cond: A condition to be met, can be callable, array-like or a - DataFrame. - other: A value or DataFrame of values to use for setting this. - inplace: Whether or not to operate inplace. - axis: The axis to apply over. Only valid when a Series is passed - as other. - level: The MultiLevel index level to apply over. - errors: Whether or not to raise errors. Does nothing in Pandas. - try_cast: Try to cast the result back to the input type. - raise_on_error: Whether to raise invalid datatypes (deprecated). - - Returns: - A new DataFrame with the replaced values. - """ - - inplace = validate_bool_kwarg(inplace, 'inplace') - - if isinstance(other, pandas.Series) and axis is None: - raise ValueError("Must specify axis=0 or 1") - - if level is not None: - raise NotImplementedError("Multilevel Index not yet supported on " - "Pandas on Ray.") - - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 - - cond = cond(self) if callable(cond) else cond - - if not isinstance(cond, DataFrame): - if not hasattr(cond, 'shape'): - cond = np.asanyarray(cond) - if cond.shape != self.shape: - raise ValueError("Array conditional must be same shape as " - "self") - cond = DataFrame(cond, index=self.index, columns=self.columns) - - zipped_partitions = self._copartition(cond, self.index) - args = (False, axis, level, errors, try_cast, raise_on_error) - - if isinstance(other, DataFrame): - other_zipped = (v for k, v in self._copartition(other, - self.index)) - - new_partitions = [_where_helper.remote(k, v, next(other_zipped), - self.columns, cond.columns, - other.columns, *args) - for k, v in zipped_partitions] - - # Series has to be treated specially because we're operating on row - # partitions from here on. - elif isinstance(other, pandas.Series): - if axis == 0: - # Pandas determines which index to use based on axis. - other = other.reindex(self.index) - other.index = pandas.RangeIndex(len(other)) - - # Since we're working on row partitions, we have to partition - # the Series based on the partitioning of self (since both - # self and cond are co-partitioned by self. - other_builder = [] - for length in self._row_metadata._lengths: - other_builder.append(other[:length]) - other = other[length:] - # Resetting the index here ensures that we apply each part - # to the correct row within the partitions. - other.index = pandas.RangeIndex(len(other)) - - other = (obj for obj in other_builder) - - new_partitions = [_where_helper.remote(k, v, - next(other, - pandas.Series()), - self.columns, - cond.columns, - None, *args) - for k, v in zipped_partitions] - else: - other = other.reindex(self.columns) - new_partitions = [_where_helper.remote(k, v, other, - self.columns, - cond.columns, - None, *args) - for k, v in zipped_partitions] - - else: - new_partitions = [_where_helper.remote(k, v, other, self.columns, - cond.columns, None, *args) - for k, v in zipped_partitions] - - if inplace: - self._update_inplace(row_partitions=new_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) - else: - return DataFrame(row_partitions=new_partitions, - row_metadata=self._row_metadata, - col_metadata=self._col_metadata) - - def xs(self, key, axis=0, level=None, drop_level=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __getitem__(self, key): - """Get the column specified by key for this DataFrame. - - Args: - key : The column name. - - Returns: - A Pandas Series representing the value for the column. - """ - key = com._apply_if_callable(key, self) - - # shortcut if we are an actual column - is_mi_columns = isinstance(self.columns, pandas.MultiIndex) - try: - if key in self.columns and not is_mi_columns: - return self._getitem_column(key) - except (KeyError, ValueError, TypeError): - pass - - # see if we can slice the rows - indexer = self._row_metadata.convert_to_index_sliceable(key) - if indexer is not None: - return self._getitem_slice(indexer) - - if isinstance(key, (pandas.Series, np.ndarray, pandas.Index, list)): - return self._getitem_array(key) - elif isinstance(key, DataFrame): - raise NotImplementedError("To contribute to Pandas on Ray, please" - "visit github.com/ray-project/ray.") - # return self._getitem_frame(key) - elif is_mi_columns: - raise NotImplementedError("To contribute to Pandas on Ray, please" - "visit github.com/ray-project/ray.") - # return self._getitem_multilevel(key) - else: - return self._getitem_column(key) - - def _getitem_column(self, key): - # may result in multiple columns? - partition = self._col_metadata[key, 'partition'] - result = ray.get(self._getitem_indiv_col(key, partition)) - result.name = key - result.index = self.index - return result - - def _getitem_array(self, key): - if com.is_bool_indexer(key): - if isinstance(key, pandas.Series) and \ - not key.index.equals(self.index): - warnings.warn("Boolean Series key will be reindexed to match " - "DataFrame index.", UserWarning, stacklevel=3) - elif len(key) != len(self.index): - raise ValueError('Item wrong length {} instead of {}.'.format( - len(key), len(self.index))) - key = check_bool_indexer(self.index, key) - - new_parts = _map_partitions(lambda df: df[key], - self._col_partitions) - columns = self.columns - index = self.index[key] - - return DataFrame(col_partitions=new_parts, - columns=columns, - index=index) - else: - columns = self._col_metadata[key].index - column_indices = {item: i for i, item in enumerate(self.columns)} - indices_for_rows = [column_indices[column] for column in columns] - - def get_columns_partition(df): - result = df.__getitem__(indices_for_rows), - result.columns = pandas.RangeIndex(0, len(result.columns)) - return result - - new_parts = [_deploy_func.remote( - lambda df: df.__getitem__(indices_for_rows), - part) for part in self._row_partitions] - - index = self.index - - return DataFrame(row_partitions=new_parts, - columns=columns, - index=index) - - def _getitem_indiv_col(self, key, part): - loc = self._col_metadata[key] - if isinstance(loc, pandas.Series): - index = loc[loc['partition'] == part] - else: - index = loc[loc['partition'] == part]['index_within_partition'] - return _deploy_func.remote( - lambda df: df.__getitem__(index), - self._col_partitions[part]) - - def _getitem_slice(self, key): - new_cols = _map_partitions(lambda df: df[key], - self._col_partitions) - - index = self.index[key] - return DataFrame(col_partitions=new_cols, - col_metadata=self._col_metadata, - index=index) - - def __getattr__(self, key): - """After regular attribute access, looks up the name in the columns - - Args: - key (str): Attribute name. - - Returns: - The value of the attribute. - """ - try: - return object.__getattribute__(self, key) - except AttributeError as e: - if key in self.columns: - return self[key] - raise e - - def __setitem__(self, key, value): - if not isinstance(key, str): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - if key not in self.columns: - self.insert(loc=len(self.columns), column=key, value=value) - else: - loc = self.columns.get_loc(key) - self.__delitem__(key) - self.insert(loc=loc, column=key, value=value) - - def __len__(self): - """Gets the length of the dataframe. - - Returns: - Returns an integer length of the dataframe object. - """ - return len(self._row_metadata) - - def __unicode__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __invert__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __hash__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __iter__(self): - """Iterate over the columns - - Returns: - An Iterator over the columns of the dataframe. - """ - return iter(self.columns) - - def __contains__(self, key): - """Searches columns for specific key - - Args: - key : The column name - - Returns: - Returns a boolean if the specified key exists as a column name - """ - return self.columns.__contains__(key) - - def __nonzero__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __bool__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __abs__(self): - """Creates a modified DataFrame by taking the absolute value. - - Returns: - A modified DataFrame - """ - return self.abs() - - def __round__(self, decimals=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __array__(self, dtype=None): - # TODO: This is very inefficient and needs fix, also see as_matrix - return to_pandas(self).__array__(dtype=dtype) - - def __array_wrap__(self, result, context=None): - # TODO: This is very inefficient, see also __array__ and as_matrix - return to_pandas(self).__array_wrap__(result, context=context) - - def __getstate__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __setstate__(self, state): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __delitem__(self, key): - """Delete a column by key. `del a[key]` for example. - Operation happens in place. - - Notes: This operation happen on row and column partition - simultaneously. No rebuild. - Args: - key: key to delete - """ - # Create helper method for deleting column(s) in row partition. - def del_helper(df, to_delete): - cols = df.columns[to_delete] # either int or an array of ints - - if not is_list_like(cols): - cols = [cols] - - for col in cols: - df.__delitem__(col) - - # Reset the column index to conserve space - df.columns = pandas.RangeIndex(0, len(df.columns)) - return df - - # This structure is used to get the correct index inside the partition. - del_df = self._col_metadata[key] - - # We need to standardize between multiple and single occurrences in the - # columns. Putting single occurrences in a pandas.DataFrame and - # transposing results in the same structure as multiple with 'loc'. - if isinstance(del_df, pandas.Series): - del_df = pandas.DataFrame(del_df).T - - # Cast cols as pandas.Series as duplicate columns mean result may be - # np.int64 or pandas.Series - col_parts_to_del = \ - pandas.Series(del_df['partition'].copy()).unique() - self._col_metadata.drop(key) - - for i in col_parts_to_del: - # Compute the correct index inside the partition to delete. - to_delete_in_partition = \ - del_df[del_df['partition'] == i]['index_within_partition'] - - for j in range(self._block_partitions.shape[0]): - self._block_partitions[j, i] = _deploy_func.remote( - del_helper, self._block_partitions[j, i], - to_delete_in_partition) - - self._col_metadata.reset_partition_coords(col_parts_to_del) - - def __finalize__(self, other, method=None, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __copy__(self, deep=True): - """Make a copy using Ray.DataFrame.copy method - - Args: - deep: Boolean, deep copy or not. - Currently we do not support deep copy. - - Returns: - A Ray DataFrame object. - """ - return self.copy(deep=deep) - - def __deepcopy__(self, memo=None): - """Make a -deep- copy using Ray.DataFrame.copy method - This is equivalent to copy(deep=True). - - Args: - memo: No effect. Just to comply with Pandas API. - - Returns: - A Ray DataFrame object. - """ - return self.copy(deep=True) - - def __and__(self, other): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __or__(self, other): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __xor__(self, other): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __lt__(self, other): - return self.lt(other) - - def __le__(self, other): - return self.le(other) - - def __gt__(self, other): - return self.gt(other) - - def __ge__(self, other): - return self.ge(other) - - def __eq__(self, other): - return self.eq(other) - - def __ne__(self, other): - return self.ne(other) - - def __add__(self, other): - return self.add(other) - - def __iadd__(self, other): - return self.add(other) - - def __radd__(self, other, axis="columns", level=None, fill_value=None): - return self.radd(other, axis, level, fill_value) - - def __mul__(self, other): - return self.mul(other) - - def __imul__(self, other): - return self.mul(other) - - def __rmul__(self, other, axis="columns", level=None, fill_value=None): - return self.rmul(other, axis, level, fill_value) - - def __pow__(self, other): - return self.pow(other) - - def __ipow__(self, other): - return self.pow(other) - - def __rpow__(self, other, axis="columns", level=None, fill_value=None): - return self.rpow(other, axis, level, fill_value) - - def __sub__(self, other): - return self.sub(other) - - def __isub__(self, other): - return self.sub(other) - - def __rsub__(self, other, axis="columns", level=None, fill_value=None): - return self.rsub(other, axis, level, fill_value) - - def __floordiv__(self, other): - return self.floordiv(other) - - def __ifloordiv__(self, other): - return self.floordiv(other) - - def __rfloordiv__(self, other, axis="columns", level=None, - fill_value=None): - return self.rfloordiv(other, axis, level, fill_value) - - def __truediv__(self, other): - return self.truediv(other) - - def __itruediv__(self, other): - return self.truediv(other) - - def __rtruediv__(self, other, axis="columns", level=None, fill_value=None): - return self.rtruediv(other, axis, level, fill_value) - - def __mod__(self, other): - return self.mod(other) - - def __imod__(self, other): - return self.mod(other) - - def __rmod__(self, other, axis="columns", level=None, fill_value=None): - return self.rmod(other, axis, level, fill_value) - - def __div__(self, other, axis="columns", level=None, fill_value=None): - return self.div(other, axis, level, fill_value) - - def __rdiv__(self, other, axis="columns", level=None, fill_value=None): - return self.rdiv(other, axis, level, fill_value) - - def __neg__(self): - """Computes an element wise negative DataFrame - - Returns: - A modified DataFrame where every element is the negation of before - """ - for t in self.dtypes: - if not (is_bool_dtype(t) - or is_numeric_dtype(t) - or is_timedelta64_dtype(t)): - raise TypeError("Unary negative expects numeric dtype, not {}" - .format(t)) - - new_block_partitions = np.array([_map_partitions( - lambda df: df.__neg__(), block) - for block in self._block_partitions]) - - return DataFrame(block_partitions=new_block_partitions, - col_metadata=self._col_metadata, - row_metadata=self._row_metadata) - - def __sizeof__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @property - def __doc__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @property - def blocks(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @property - def style(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def iat(self, axis=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @property - def loc(self): - """Purely label-location based indexer for selection by label. - - We currently support: single label, list array, slice object - We do not support: boolean array, callable - """ - from .indexing import _Loc_Indexer - return _Loc_Indexer(self) - - @property - def is_copy(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def at(self, axis=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def ix(self, axis=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @property - def iloc(self): - """Purely integer-location based indexing for selection by position. - - We currently support: single label, list array, slice object - We do not support: boolean array, callable - """ - from .indexing import _iLoc_Indexer - return _iLoc_Indexer(self) - - def _copartition(self, other, new_index): - """Colocates the values of other with this for certain operations. - - NOTE: This method uses the indexes of each DataFrame to order them the - same. This operation does an implicit shuffling of data and zips - the two DataFrames together to be operated on. - - Args: - other: The other DataFrame to copartition with. - - Returns: - Two new sets of partitions, copartitioned and zipped. - """ - # Put in the object store so they aren't serialized each iteration. - old_self_index = ray.put(self.index) - new_index = ray.put(new_index) - old_other_index = ray.put(other.index) - - new_num_partitions = max(len(self._block_partitions.T), - len(other._block_partitions.T)) - - new_partitions_self = \ - np.array([_reindex_helper._submit( - args=tuple([old_self_index, new_index, 1, - new_num_partitions] + block.tolist()), - num_return_vals=new_num_partitions) - for block in self._block_partitions.T]).T - - new_partitions_other = \ - np.array([_reindex_helper._submit( - args=tuple([old_other_index, new_index, 1, - new_num_partitions] + block.tolist()), - num_return_vals=new_num_partitions) - for block in other._block_partitions.T]).T - - return zip(new_partitions_self, new_partitions_other) - - def _operator_helper(self, func, other, axis, level, *args): - """Helper method for inter-dataframe and scalar operations""" - if isinstance(other, DataFrame): - return self._inter_df_op_helper( - lambda x, y: func(x, y, axis, level, *args), - other, "outer", axis, level) - else: - return self._single_df_op_helper( - lambda df: func(df, other, axis, level, *args), - other, axis, level) - - def _inter_df_op_helper(self, func, other, how, axis, level, - inplace=False): - if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") - axis = pandas.DataFrame()._get_axis_number(axis) - - new_column_index = self.columns.join(other.columns, how=how) - new_index = self.index.join(other.index, how=how) - copartitions = self._copartition(other, new_index) - - new_blocks = \ - np.array([_co_op_helper._submit( - args=tuple([func, self.columns, other.columns, - len(part[0]), None] + - np.concatenate(part).tolist()), - num_return_vals=len(part[0])) - for part in copartitions]) - - if not inplace: - # TODO join the Index Metadata objects together for performance. - return DataFrame(block_partitions=new_blocks, - columns=new_column_index, - index=new_index) - else: - self._update_inplace(block_partitions=new_blocks, - columns=new_column_index, - index=new_index) - - def _single_df_op_helper(self, func, other, axis, level): - if level is not None: - raise NotImplementedError("Multilevel index not yet supported " - "in Pandas on Ray") - axis = pandas.DataFrame()._get_axis_number(axis) - - if is_list_like(other): - new_index = self.index - new_column_index = self.columns - new_col_metadata = self._col_metadata - new_row_metadata = self._row_metadata - new_blocks = None - - if axis == 0: - if len(other) != len(self.index): - raise ValueError( - "Unable to coerce to Series, length must be {0}: " - "given {1}".format(len(self.index), len(other))) - new_columns = _map_partitions(func, self._col_partitions) - new_rows = None - else: - if len(other) != len(self.columns): - raise ValueError( - "Unable to coerce to Series, length must be {0}: " - "given {1}".format(len(self.columns), len(other))) - new_rows = _map_partitions(func, self._row_partitions) - new_columns = None - - else: - new_blocks = np.array([_map_partitions(func, block) - for block in self._block_partitions]) - new_columns = None - new_rows = None - new_index = self.index - new_column_index = self.columns - new_col_metadata = self._col_metadata - new_row_metadata = self._row_metadata - - return DataFrame(col_partitions=new_columns, - row_partitions=new_rows, - block_partitions=new_blocks, - index=new_index, - columns=new_column_index, - col_metadata=new_col_metadata, - row_metadata=new_row_metadata) - - -@ray.remote -def _merge_columns(left_columns, right_columns, *args): - """Merge two columns to get the correct column names and order. - - Args: - left_columns: The columns on the left side of the merge. - right_columns: The columns on the right side of the merge. - args: The arguments for the merge. - - Returns: - The columns for the merge operation. - """ - return pandas.DataFrame(columns=left_columns, index=[0], dtype='uint8') \ - .merge(pandas.DataFrame(columns=right_columns, index=[0], - dtype='uint8'), *args).columns - - -@ray.remote -def _where_helper(left, cond, other, left_columns, cond_columns, - other_columns, *args): - - left = pandas.concat(ray.get(left.tolist()), axis=1, copy=False) - # We have to reset the index and columns here because we are coming - # from blocks and the axes are set according to the blocks. We have - # already correctly copartitioned everything, so there's no - # correctness problems with doing this. - left.reset_index(inplace=True, drop=True) - left.columns = left_columns - - cond = pandas.concat(ray.get(cond.tolist()), axis=1, copy=False) - cond.reset_index(inplace=True, drop=True) - cond.columns = cond_columns - - if isinstance(other, np.ndarray): - other = pandas.concat(ray.get(other.tolist()), axis=1, copy=False) - other.reset_index(inplace=True, drop=True) - other.columns = other_columns - - return left.where(cond, other, *args) - - -@ray.remote -def reindex_helper(old_index, new_index, axis, npartitions, method, fill_value, - limit, tolerance, *df): - df = pandas.concat(df, axis=axis ^ 1, copy=False) - if axis == 1: - df.index = old_index - else: - df.columns = old_index - - df = df.reindex(new_index, copy=False, axis=axis ^ 1, - method=method, fill_value=fill_value, - limit=limit, tolerance=tolerance) - return create_blocks_helper(df, npartitions, axis) - - -@ray.remote -def _equals_helper(left, right): - right = pandas.concat(ray.get(right.tolist()), axis=1, copy=False) - left = pandas.concat(ray.get(left.tolist()), axis=1, copy=False) - # Since we know that the index and columns match, we can just check the - # values. We can't use np.array_equal here because it doesn't recognize - # np.nan as equal to another np.nan - try: - assert_equal(left.values, right.values) - except AssertionError: - return False - return True diff --git a/python/ray/dataframe/datetimes.py b/python/ray/dataframe/datetimes.py deleted file mode 100644 index 48a437ceb..000000000 --- a/python/ray/dataframe/datetimes.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pandas -import ray - -from .dataframe import DataFrame -from .utils import _map_partitions - - -def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, - box=True, format=None, exact=True, unit=None, - infer_datetime_format=False, origin='unix'): - """Convert the arg to datetime format. If not Ray DataFrame, this falls - back on pandas. - - Args: - errors ('raise' or 'ignore'): If 'ignore', errors are silenced. - dayfirst (bool): Date format is passed in as day first. - yearfirst (bool): Date format is passed in as year first. - utc (bool): retuns a UTC DatetimeIndex if True. - box (bool): If True, returns a DatetimeIndex. - format (string): strftime to parse time, eg "%d/%m/%Y". - exact (bool): If True, require an exact format match. - unit (string, default 'ns'): unit of the arg. - infer_datetime_format (bool): Whether or not to infer the format. - origin (string): Define the reference date. - - Returns: - Type depends on input: - - - list-like: DatetimeIndex - - Series: Series of datetime64 dtype - - scalar: Timestamp - """ - if not isinstance(arg, DataFrame): - return pandas.to_datetime(arg, errors=errors, dayfirst=dayfirst, - yearfirst=yearfirst, utc=utc, box=box, - format=format, exact=exact, unit=unit, - infer_datetime_format=infer_datetime_format, - origin=origin) - if errors == 'raise': - pandas.to_datetime(pandas.DataFrame(columns=arg.columns), - errors=errors, dayfirst=dayfirst, - yearfirst=yearfirst, utc=utc, box=box, - format=format, exact=exact, unit=unit, - infer_datetime_format=infer_datetime_format, - origin=origin) - - def datetime_helper(df, cols): - df.columns = cols - return pandas.to_datetime(df, errors=errors, dayfirst=dayfirst, - yearfirst=yearfirst, utc=utc, box=box, - format=format, exact=exact, unit=unit, - infer_datetime_format=infer_datetime_format, - origin=origin) - - datetime_series = _map_partitions(datetime_helper, arg._row_partitions, - arg.columns) - result = pandas.concat(ray.get(datetime_series), copy=False) - result.index = arg.index - - return result diff --git a/python/ray/dataframe/groupby.py b/python/ray/dataframe/groupby.py deleted file mode 100644 index 39862d94d..000000000 --- a/python/ray/dataframe/groupby.py +++ /dev/null @@ -1,581 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pandas -import numpy as np -import pandas.core.groupby -from pandas.core.dtypes.common import is_list_like -import pandas.core.common as com - -import ray - -from .utils import _inherit_docstrings, _reindex_helper -from .concat import concat -from .index_metadata import _IndexMetadata - - -@_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy, - excluded=[pandas.core.groupby.DataFrameGroupBy, - pandas.core.groupby.DataFrameGroupBy.__init__]) -class DataFrameGroupBy(object): - - def __init__(self, df, by, axis, level, as_index, sort, group_keys, - squeeze, **kwargs): - - self._columns = df.columns - self._index = df.index - self._axis = axis - - self._row_metadata = df._row_metadata - self._col_metadata = df._col_metadata - - if axis == 0: - partitions = [column for column in df._block_partitions.T] - self._index_grouped = \ - pandas.Series(self._index, index=self._index) \ - .groupby(by=by, sort=sort) - else: - partitions = [row for row in df._block_partitions] - self._index_grouped = \ - pandas.Series(self._columns, index=self._columns) \ - .groupby(by=by, sort=sort) - - self._keys_and_values = [(k, v) - for k, v in self._index_grouped] - - if len(self) > 1: - self._grouped_partitions = \ - list(zip(*(groupby._submit(args=(by, - axis, - level, - as_index, - sort, - group_keys, - squeeze) - + tuple(part.tolist()), - num_return_vals=len(self)) - for part in partitions))) - else: - if axis == 0: - self._grouped_partitions = [df._col_partitions] - else: - self._grouped_partitions = [df._row_partitions] - - def __getattr__(self, key): - """Afer regular attribute access, looks up the name in the columns - - Args: - key (str): Attribute name. - - Returns: - The value of the attribute. - """ - try: - return object.__getattribute__(self, key) - except AttributeError as e: - if key in self._columns: - raise NotImplementedError( - "SeriesGroupBy is not implemented." - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - raise e - - @property - def _iter(self): - from .dataframe import DataFrame - - if self._axis == 0: - return [(self._keys_and_values[i][0], - DataFrame(col_partitions=part, - columns=self._columns, - index=self._keys_and_values[i][1].index, - col_metadata=self._col_metadata)) - for i, part in enumerate(self._grouped_partitions)] - else: - return [(self._keys_and_values[i][0], - DataFrame(row_partitions=part, - columns=self._keys_and_values[i][1].index, - index=self._index, - row_metadata=self._row_metadata)) - for i, part in enumerate(self._grouped_partitions)] - - @property - def ngroups(self): - return len(self) - - def skew(self, **kwargs): - return self._apply_agg_function(lambda df: df.skew(axis=self._axis, - **kwargs)) - - def ffill(self, limit=None): - return self._apply_df_function(lambda df: df.ffill(axis=self._axis, - limit=limit)) - - def sem(self, ddof=1): - return self._apply_agg_function(lambda df: df.sem(axis=self._axis, - ddof=ddof)) - - def mean(self, *args, **kwargs): - return self._apply_agg_function(lambda df: df.mean(axis=self._axis, - *args, - **kwargs)) - - def any(self): - return self._apply_agg_function(lambda df: df.any(axis=self._axis)) - - @property - def plot(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def ohlc(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __bytes__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @property - def tshift(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - @property - def groups(self): - return {k: pandas.Index(v) for k, v in self._keys_and_values} - - def min(self, **kwargs): - return self._apply_agg_function(lambda df: df.min(axis=self._axis, - **kwargs)) - - def idxmax(self): - def idxmax_helper(df, index): - result = df.idxmax(axis=self._axis) - result = result.apply(lambda v: index[v]) - return result - - results = [idxmax_helper(g[1], i[1]) - for g, i in zip(self._iter, self._index_grouped)] - - new_df = concat(results, axis=1) - if self._axis == 0: - new_df = new_df.T - new_df.columns = self._columns - new_df.index = [k for k, v in self._iter] - else: - new_df.columns = [k for k, v in self._iter] - new_df.index = self._index - return new_df - - @property - def ndim(self): - return 2 # ndim is always 2 for DataFrames - - def shift(self, periods=1, freq=None, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def nth(self, n, dropna=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def cumsum(self, axis=0, *args, **kwargs): - return self._apply_df_function(lambda df: df.cumsum(axis, - *args, - **kwargs)) - - @property - def indices(self): - return dict(self._keys_and_values) - - def pct_change(self): - return self._apply_agg_function( - lambda df: df.pct_change(axis=self._axis)) - - def filter(self, func, dropna=True, *args, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def cummax(self, axis=0, **kwargs): - return self._apply_df_function(lambda df: df.cummax(axis, - **kwargs)) - - def apply(self, func, *args, **kwargs): - def apply_helper(df): - return df.apply(func, axis=self._axis, *args, **kwargs) - - result = [func(v) for k, v in self._iter] - if self._axis == 0: - if isinstance(result[0], pandas.Series): - # Applied an aggregation function - new_df = concat(result, axis=1).T - new_df.columns = self._columns - new_df.index = [k for k, v in self._iter] - else: - new_df = concat(result, axis=self._axis) - new_df._block_partitions = np.array([_reindex_helper._submit( - args=tuple([new_df.index, self._index, self._axis ^ 1, - len(new_df._block_partitions)] - + block.tolist()), - num_return_vals=len(new_df._block_partitions)) - for block in new_df._block_partitions.T]).T - new_df.index = self._index - new_df._row_metadata = \ - _IndexMetadata(new_df._block_partitions[:, 0], - index=new_df.index, axis=0) - else: - if isinstance(result[0], pandas.Series): - # Applied an aggregation function - new_df = concat(result, axis=1) - new_df.columns = [k for k, v in self._iter] - new_df.index = self._index - else: - new_df = concat(result, axis=self._axis) - new_df._block_partitions = np.array([_reindex_helper._submit( - args=tuple([new_df.columns, self._columns, self._axis ^ 1, - new_df._block_partitions.shape[1]] - + block.tolist()), - num_return_vals=new_df._block_partitions.shape[1]) - for block in new_df._block_partitions]) - new_df.columns = self._columns - new_df._col_metadata = \ - _IndexMetadata(new_df._block_partitions[0, :], - index=new_df.columns, axis=1) - return new_df - - @property - def dtypes(self): - if self._axis == 1: - raise ValueError("Cannot call dtypes on groupby with axis=1") - return self._apply_agg_function(lambda df: df.dtypes) - - def first(self, **kwargs): - return self._apply_agg_function(lambda df: df.first(offset=0, - **kwargs)) - - def backfill(self, limit=None): - return self.bfill(limit) - - def __getitem__(self, key): - # This operation requires a SeriesGroupBy Object - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def cummin(self, axis=0, **kwargs): - return self._apply_df_function(lambda df: df.cummin(axis=axis, - **kwargs)) - - def bfill(self, limit=None): - return self._apply_df_function(lambda df: df.bfill(axis=self._axis, - limit=limit)) - - def idxmin(self): - def idxmin_helper(df, index): - result = df.idxmin(axis=self._axis) - result = result.apply(lambda v: index[v]) - return result - - results = [idxmin_helper(g[1], i[1]) - for g, i in zip(self._iter, self._index_grouped)] - - new_df = concat(results, axis=1) - if self._axis == 0: - new_df = new_df.T - new_df.columns = self._columns - new_df.index = [k for k, v in self._iter] - else: - new_df.columns = [k for k, v in self._iter] - new_df.index = self._index - return new_df - - def prod(self, **kwargs): - return self._apply_agg_function(lambda df: df.prod(axis=self._axis, - **kwargs)) - - def std(self, ddof=1, *args, **kwargs): - return self._apply_agg_function(lambda df: df.std(axis=self._axis, - ddof=ddof, - *args, - **kwargs)) - - def aggregate(self, arg, *args, **kwargs): - if self._axis != 0: - # This is not implemented in pandas, - # so we throw a different message - raise NotImplementedError("axis other than 0 is not supported") - - if is_list_like(arg): - raise NotImplementedError( - "This requires Multi-level index to be implemented. " - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - return self._apply_agg_function(lambda df: df.agg(arg, - axis=self._axis, - *args, - **kwargs)) - - def last(self, **kwargs): - return self._apply_df_function(lambda df: df.last(offset=0, - **kwargs)) - - def mad(self): - return self._apply_agg_function(lambda df: df.mad()) - - def rank(self): - return self._apply_df_function(lambda df: df.rank(axis=self._axis)) - - @property - def corrwith(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def pad(self, limit=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def max(self, **kwargs): - return self._apply_agg_function(lambda df: df.max(axis=self._axis, - **kwargs)) - - def var(self, ddof=1, *args, **kwargs): - return self._apply_agg_function(lambda df: df.var(ddof=ddof, - axis=self._axis, - *args, - **kwargs)) - - def get_group(self, name, obj=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def __len__(self): - return len(self._keys_and_values) - - def all(self): - return self._apply_agg_function(lambda df: df.all()) - - def size(self): - return self._apply_agg_function(lambda df: df.size) - - def sum(self, **kwargs): - return self._apply_agg_function(lambda df: - df.sum(axis=self._axis, **kwargs)) - - def __unicode__(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def describe(self, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def boxplot(self, grouped, subplots=True, column=None, fontsize=None, - rot=0, grid=True, ax=None, figsize=None, layout=None, **kwds): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def ngroup(self, ascending=True): - return self._index_grouped.ngroup(ascending) - - def nunique(self, dropna=True): - return self._apply_agg_function(lambda df: df.nunique(dropna=dropna, - axis=self._axis)) - - def resample(self, rule, *args, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def median(self, **kwargs): - return self._apply_agg_function(lambda df: df.median(axis=self._axis, - **kwargs)) - - def head(self, n=5): - result = [v.head(n) for k, v in self._iter] - new_df = concat(result, axis=self._axis) - - if self._axis == 0: - index_head = [v[:n] for k, v in self._keys_and_values] - flattened_index = {i for j in index_head for i in j} - sorted_index = [i for i in self._index if i in flattened_index] - new_df._block_partitions = np.array([_reindex_helper._submit( - args=tuple([new_df.index, sorted_index, 1, - len(new_df._block_partitions)] + block.tolist()), - num_return_vals=len(new_df._block_partitions)) - for block in new_df._block_partitions.T]).T - new_df.index = sorted_index - new_df._row_metadata = \ - _IndexMetadata(new_df._block_partitions[:, 0], - index=new_df.index, axis=0) - - return new_df - - def cumprod(self, axis=0, *args, **kwargs): - return self._apply_df_function(lambda df: df.cumprod(axis, - *args, - **kwargs)) - - def __iter__(self): - return self._iter.__iter__() - - def agg(self, arg, *args, **kwargs): - return self.aggregate(arg, *args, **kwargs) - - def cov(self): - return self._apply_agg_function(lambda df: df.cov()) - - def transform(self, func, *args, **kwargs): - return self._apply_df_function(lambda df: df.transform(func, - *args, - **kwargs)) - - def corr(self, **kwargs): - return self._apply_agg_function(lambda df: df.corr(**kwargs)) - - def fillna(self, **kwargs): - return self._apply_df_function(lambda df: df.fillna(axis=self._axis, - **kwargs)) - - def count(self, **kwargs): - return self._apply_agg_function(lambda df: df.count(self._axis, - **kwargs)) - - def pipe(self, func, *args, **kwargs): - return com._pipe(self, func, *args, **kwargs) - - def cumcount(self, ascending=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def tail(self, n=5): - result = [v.tail(n) for k, v in self._iter] - new_df = concat(result, axis=self._axis) - - if self._axis == 0: - index_tail = [v[-n:] for k, v in self._keys_and_values] - flattened_index = {i for j in index_tail for i in j} - sorted_index = [i for i in self._index if i in flattened_index] - new_df._block_partitions = np.array([_reindex_helper._submit( - args=tuple([new_df.index, sorted_index, 1, - len(new_df._block_partitions)] + block.tolist()), - num_return_vals=len(new_df._block_partitions)) - for block in new_df._block_partitions.T]).T - new_df.index = sorted_index - new_df._row_metadata = \ - _IndexMetadata(new_df._block_partitions[:, 0], - index=new_df.index, axis=0) - - return new_df - - # expanding and rolling are unique cases and need to likely be handled - # separately. They do not appear to be commonly used. - def expanding(self, *args, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def rolling(self, *args, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def hist(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def quantile(self, q=0.5, **kwargs): - if is_list_like(q): - raise NotImplementedError( - "This requires Multi-level index to be implemented. " - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - return self._apply_agg_function(lambda df: df.quantile(q=q, - axis=self._axis, - **kwargs)) - - def diff(self): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - def take(self, **kwargs): - return self._apply_df_function(lambda df: df.take(**kwargs)) - - def _apply_agg_function(self, f): - assert callable(f), "\'{0}\' object is not callable".format(type(f)) - - result = [f(v) for k, v in self._iter] - new_df = concat(result, axis=1) - - if self._axis == 0: - new_df = new_df.T - new_df.columns = self._columns - new_df.index = [k for k, v in self._iter] - else: - new_df.columns = [k for k, v in self._iter] - new_df.index = self._index - return new_df - - def _apply_df_function(self, f, concat_axis=None): - assert callable(f), "\'{0}\' object is not callable".format(type(f)) - - result = [f(v) for k, v in self._iter] - concat_axis = self._axis if concat_axis is None else concat_axis - - new_df = concat(result, axis=concat_axis) - - if self._axis == 0: - new_df._block_partitions = np.array([_reindex_helper._submit( - args=tuple([new_df.index, self._index, 1, - len(new_df._block_partitions)] + block.tolist()), - num_return_vals=len(new_df._block_partitions)) - for block in new_df._block_partitions.T]).T - new_df.index = self._index - new_df._row_metadata = \ - _IndexMetadata(new_df._block_partitions[:, 0], - index=new_df.index, axis=0) - else: - new_df._block_partitions = np.array([_reindex_helper._submit( - args=tuple([new_df.columns, self._columns, 0, - new_df._block_partitions.shape[1]] - + block.tolist()), - num_return_vals=new_df._block_partitions.shape[1]) - for block in new_df._block_partitions]) - new_df.columns = self._columns - new_df._col_metadata = \ - _IndexMetadata(new_df._block_partitions[0, :], - index=new_df.columns, axis=1) - - return new_df - - -@ray.remote -def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df): - - df = pandas.concat(df, axis=axis) - - return [v for k, v in df.groupby(by=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze)] diff --git a/python/ray/dataframe/index_metadata.py b/python/ray/dataframe/index_metadata.py deleted file mode 100644 index 62bea3f0e..000000000 --- a/python/ray/dataframe/index_metadata.py +++ /dev/null @@ -1,415 +0,0 @@ -import pandas -import numpy as np -import ray - -from .utils import ( - _build_row_lengths, - _build_col_widths, - _build_coord_df) - -from pandas.core.indexing import convert_to_index_sliceable - - -class _IndexMetadata(object): - """Wrapper for Pandas indexes in Ray DataFrames. Handles all of the - metadata specific to the axis of partition (setting indexes, - calculating the index within partition of a value, etc.). This - implementation assumes the underlying index lies across multiple - partitions. - - IMPORTANT NOTE: Currently all operations, as implemented, are inplace. - - WARNING: Currently, the `_lengths` item is the source of truth for an - _IndexMetadata object, since it is easy to manage, and that the coord_df - item may be deprecated in the future. As such, it is _very_ important that - any functions that mutate the coord_df splits in anyway first modify the - lengths. Otherwise bad things might happen! - """ - - def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None, - coord_df_oid=None): - """Inits a IndexMetadata from Ray DataFrame partitions - - Args: - dfs ([ObjectID]): ObjectIDs of dataframe partitions - index (pandas.Index): Index of the Ray DataFrame. - axis: Axis of partition (0=row partitions, 1=column partitions) - - Returns: - A IndexMetadata backed by the specified pandas.Index, partitioned - off specified partitions - """ - assert (lengths_oid is None) == (coord_df_oid is None), \ - "Must pass both or neither of lengths_oid and coord_df_oid" - - if dfs is not None and lengths_oid is None: - if axis == 0: - lengths_oid = _build_row_lengths.remote(dfs) - else: - lengths_oid = _build_col_widths.remote(dfs) - coord_df_oid = _build_coord_df.remote(lengths_oid, index) - - self._lengths = lengths_oid - self._coord_df = coord_df_oid - self._index_cache = index - self._cached_index = False - - def _get__lengths(self): - if isinstance(self._lengths_cache, ray.ObjectID) or \ - (isinstance(self._lengths_cache, list) and - isinstance(self._lengths_cache[0], ray.ObjectID)): - self._lengths_cache = ray.get(self._lengths_cache) - return self._lengths_cache - - def _set__lengths(self, lengths): - self._lengths_cache = lengths - - _lengths = property(_get__lengths, _set__lengths) - - def _get__coord_df(self): - """Get the coordinate dataframe wrapped by this _IndexMetadata. - - Since we may have had an index set before our coord_df was - materialized, we'll have to apply it to the newly materialized df - """ - if isinstance(self._coord_df_cache, ray.ObjectID): - self._coord_df_cache = ray.get(self._coord_df_cache) - if self._cached_index: - self._coord_df_cache.index = self._index_cache - self._cached_index = False - return self._coord_df_cache - - def _set__coord_df(self, coord_df): - """Set the coordinate dataframe wrapped by this _IndexMetadata. - - Sometimes we set the _IndexMetadata's coord_df outside of the - constructor, generally using fxns like drop(). This produces a modified - index, so we need to reflect the change on the index cache. - - If the set _IndexMetadata is an OID instead (due to a copy or whatever - reason), we fall back relying on `_index_cache`. - """ - if not isinstance(coord_df, ray.ObjectID): - self._index_cache = coord_df.index - self._coord_df_cache = coord_df - - _coord_df = property(_get__coord_df, _set__coord_df) - - def _get_index(self): - """Get the index wrapped by this _IndexMetadata. - - The only time `self._index_cache` would be None is in a newly created - _IndexMetadata object without a specified `index` parameter (See the - _IndexMetadata constructor for more details) - """ - if isinstance(self._coord_df_cache, ray.ObjectID): - return self._index_cache - else: - return self._coord_df_cache.index - - def _set_index(self, new_index): - """Set the index wrapped by this _IndexMetadata. - - It is important to always set `_index_cache` even if the coord_df is - materialized due to the possibility that it is set to an OID later on. - This design is more straightforward than caching indexes on setting the - coord_df to an OID due to the possibility of an OID-to-OID change. - """ - new_index = pandas.DataFrame(index=new_index).index - assert len(new_index) == len(self) - - self._index_cache = new_index - if isinstance(self._coord_df_cache, ray.ObjectID): - self._cached_index = True - else: - self._coord_df_cache.index = new_index - - index = property(_get_index, _set_index) - - def _get_index_cache(self): - """Get the cached Index object, which may sometimes be an OID. - - This will ray.get the Index object out of the Ray store lazily, such - that it is not grabbed until it is needed in the driver. This layer of - abstraction is important for allowing this object to be instantiated - with a remote Index object. - - Returns: - The Index object in _index_cache. - """ - if self._index_cache_validator is None: - self._index_cache_validator = pandas.RangeIndex(len(self)) - elif isinstance(self._index_cache_validator, - ray.ObjectID): - self._index_cache_validator = ray.get(self._index_cache_validator) - - return self._index_cache_validator - - def _set_index_cache(self, new_index): - """Sets the new index cache. - - Args: - new_index: The Index to set the _index_cache to. - """ - self._index_cache_validator = new_index - - # _index_cache_validator is an extra layer of abstraction to allow the - # cache to accept ObjectIDs and ray.get them when needed. - _index_cache = property(_get_index_cache, _set_index_cache) - - def coords_of(self, key): - """Returns the coordinates (partition, index_within_partition) of the - provided key in the index. Can be called on its own or implicitly - through __getitem__ - - Args: - key: - item to get coordinates of. Can also be a tuple of item - and {"partition", "index_within_partition"} if caller only - needs one of the coordinates - - Returns: - Pandas object with the keys specified. If key is a single object - it will be a pandas.Series with items `partition` and - `index_within_partition`, and if key is a slice or if the key is - duplicate it will be a pandas.DataFrame with said items as columns. - """ - return self._coord_df.loc[key] - - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): - # TODO: Find out what this does, and write a docstring - assignments_df = self._coord_df.groupby(by=by, axis=axis, level=level, - as_index=as_index, sort=sort, - group_keys=group_keys, - squeeze=squeeze, **kwargs)\ - .apply(lambda x: x[:]) - return assignments_df - - def partition_series(self, partition): - return self[self._coord_df['partition'] == partition, - 'index_within_partition'] - - def __len__(self): - return int(sum(self._lengths)) - - def reset_partition_coords(self, partitions=None): - partitions = np.array(partitions) - - for partition in partitions: - partition_mask = (self._coord_df['partition'] == partition) - # Since we are replacing columns with RangeIndex inside the - # partition, we have to make sure that our reference to it is - # upandasated as well. - try: - self._coord_df.loc[partition_mask, - 'index_within_partition'] = np.arange( - sum(partition_mask)).astype(int) - except ValueError: - # Copy the arrow sealed dataframe so we can mutate it. - # We only do this the first time we try to mutate the sealed. - self._coord_df = self._coord_df.copy() - self._coord_df.loc[partition_mask, - 'index_within_partition'] = np.arange( - sum(partition_mask)).astype(int) - - def insert(self, key, loc=None, partition=None, - index_within_partition=None): - """Inserts a key at a certain location in the index, or a certain coord - in a partition. Called with either `loc` or `partition` and - `index_within_partition`. If called with both, `loc` will be used. - - Args: - key: item to insert into index - loc: location to insert into index - partition: partition to insert into - index_within_partition: index within partition to insert into - - Returns: - DataFrame with coordinates of insert - """ - # Perform insert on a specific partition - # Determine which partition to place it in, and where in that partition - if loc is not None: - cum_lens = np.cumsum(self._lengths) - if len(cum_lens) > 1: - partition = np.digitize(loc, cum_lens[:-1], right=True) - else: - partition = 0 - if partition >= len(cum_lens): - if loc > cum_lens[-1]: - raise IndexError("index {0} is out of bounds".format(loc)) - else: - index_within_partition = self._lengths[-1] - else: - first_in_partition = \ - np.asscalar(np.concatenate(([0], cum_lens))[partition]) - index_within_partition = loc - first_in_partition - - # TODO: Stop-gap solution until we begin passing IndexMetadatas - return partition, index_within_partition - - # Generate new index - new_index = self.index.insert(loc, key) - - # Shift indices in partition where we inserted column - idx_locs = (self._coord_df.partition == partition) & \ - (self._coord_df.index_within_partition == - index_within_partition) - # TODO: Determine why self._coord_df{,_cache} are read-only - _coord_df_copy = self._coord_df.copy() - _coord_df_copy.loc[idx_locs, 'index_within_partition'] += 1 - - # TODO: Determine if there's a better way to do a row-index insert in - # pandas, because this is very annoying/unsure of efficiency - # Create new coord entry to insert - coord_to_insert = pandas.DataFrame( - {'partition': partition, - 'index_within_partition': index_within_partition}, - index=[key]) - - # Insert into cached RangeIndex, and order by new column index - self._coord_df = _coord_df_copy.append(coord_to_insert).loc[new_index] - - # Return inserted coordinate for callee - return coord_to_insert - - def get_global_indices(self, partition, index_within_partition_list): - total = 0 - for i in range(partition): - total += self._lengths[i] - - return [total + i for i in index_within_partition_list] - - def squeeze(self, partition, index_within_partition): - """Prepare a single coordinate for removal by "squeezing" the - subsequent coordinates "up" one index within that partition. To be used - with "_IndexMetadata.drop" for when all the "squeezed" coordinates are - dropped in batch. Note that this function doesn't actually mutate the - coord_df. - """ - self._coord_df = self._coord_df.copy() - - partition_mask = self._coord_df.partition == partition - index_within_partition_mask = \ - self._coord_df.index_within_partition > index_within_partition - self._coord_df.loc[partition_mask & index_within_partition_mask, - 'index_within_partition'] -= 1 - - def copy(self): - # TODO: Investigate copy-on-write wrapper for metadata objects - coord_df_copy = self._coord_df_cache - if not isinstance(self._coord_df_cache, ray.ObjectID): - coord_df_copy = self._coord_df_cache.copy() - - lengths_copy = self._lengths_cache - if not isinstance(self._lengths_cache, ray.ObjectID): - lengths_copy = self._lengths_cache.copy() - - index_copy = self._index_cache - if self._index_cache is not None: - index_copy = self._index_cache.copy() - - return _IndexMetadata(index=index_copy, - coord_df_oid=coord_df_copy, - lengths_oid=lengths_copy) - - def __getitem__(self, key): - """Returns the coordinates (partition, index_within_partition) of the - provided key in the index. Essentially just an alias for - `_IndexMetadata.coords_of` that allows for slice passing, since - slices cannot be passed with slice notation other than through - `__getitem__` calls. - - Args: - key: - item to get coordinates of. Can also be a tuple of item - and {"partition", "index_within_partition"} if caller only - needs one of the coordinates - - Returns: - Pandas object with the keys specified. If key is a single object - it will be a pandas.Series with items `partition` and - `index_within_partition`, and if key is a slice or if the key is - duplicate it will be a pandas.DataFrame with said items as columns. - """ - return self.coords_of(key) - - def first_valid_index(self): - return self._coord_df.first_valid_index() - - def last_valid_index(self): - return self._coord_df.last_valid_index() - - def drop(self, labels, errors='raise'): - """Drop the specified labels from the IndexMetadata - - Args: - labels (scalar or list-like): - The labels to drop - errors ('raise' or 'ignore'): - If 'ignore', suppress errors for when labels don't exist - - Returns: - DataFrame with coordinates of dropped labels - """ - dropped = self.coords_of(labels) - - # Upandasate first lengths to prevent possible length inconsistencies - if isinstance(dropped, pandas.DataFrame): - try: - drop_per_part = dropped.groupby(["partition"]).size()\ - .reindex(index=pandas.RangeIndex(len(self._lengths)), - fill_value=0) - except ValueError: - # Copy the arrow sealed dataframe so we can mutate it. - dropped = dropped.copy() - drop_per_part = dropped.groupby(["partition"]).size()\ - .reindex(index=pandas.RangeIndex(len(self._lengths)), - fill_value=0) - elif isinstance(dropped, pandas.Series): - drop_per_part = np.zeros_like(self._lengths) - drop_per_part[dropped["partition"]] = 1 - else: - raise AssertionError("Unrecognized result from `coords_of`") - - self._lengths = self._lengths - np.array(drop_per_part) - - new_coord_df = self._coord_df.drop(labels, errors=errors) - - num_dropped = 0 - for i, length in enumerate(self._lengths): - if length == 0: - num_dropped += 1 - if num_dropped > 0: - new_coord_df['partition'][new_coord_df['partition'] == i] \ - -= num_dropped - - self._coord_df = new_coord_df - return dropped - - def rename_index(self, mapper): - """Rename the index. - - Args: - mapper: name to rename the index as - """ - self._coord_df = self._coord_df.rename_axis(mapper, axis=0) - - def convert_to_index_sliceable(self, key): - """Converts and performs error checking on the passed slice - - Args: - key: slice to convert and check - """ - return convert_to_index_sliceable(self._coord_df, key) - - def get_partition(self, partition_id): - """Return a view of coord_df where partition = partition_id - """ - return self._coord_df[self._coord_df.partition == partition_id] - - def sorted_index(self): - return (self._coord_df - .sort_values(['partition', 'index_within_partition']) - .index) diff --git a/python/ray/dataframe/indexing.py b/python/ray/dataframe/indexing.py deleted file mode 100644 index 2fa2e1e8d..000000000 --- a/python/ray/dataframe/indexing.py +++ /dev/null @@ -1,469 +0,0 @@ -"""Indexing Helper Class works as follows: - -_Location_Indexer_Base provide methods framework for __getitem__ - and __setitem__ that work with Ray DataFrame's internal index. Base - class's __{get,set}item__ takes in partitions & idx_in_partition data - and perform lookup/item write. - -_LocIndexer and _iLocIndexer is responsible for indexer specific logic and - lookup computation. Loc will take care of enlarge dataframe. Both indexer - will take care of translating pandas's lookup to Ray DataFrame's internal - lookup. - -An illustration is available at -https://github.com/ray-project/ray/pull/1955#issuecomment-386781826 -""" -import pandas -import numpy as np -import ray -from warnings import warn - -from pandas.api.types import (is_scalar, is_list_like, is_bool) -from pandas.core.dtypes.common import is_integer -from pandas.core.indexing import IndexingError - -from .utils import (_blocks_to_col, _get_nan_block_id, extractor, - _mask_block_partitions, writer) -from .index_metadata import _IndexMetadata -from .dataframe import DataFrame - - -def is_slice(x): return isinstance(x, slice) - - -def is_2d(x): return is_list_like(x) or is_slice(x) - - -def is_tuple(x): return isinstance(x, tuple) - - -def is_boolean_array(x): return is_list_like(x) and all(map(is_bool, x)) - - -def is_integer_slice(x): - if not is_slice(x): - return False - for pos in [x.start, x.stop, x.step]: - if not ((pos is None) or is_integer(pos)): - return False # one position is neither None nor int - return True - - -_ENLARGEMENT_WARNING = """ -Passing list-likes to .loc or [] with any missing label will raise -KeyError in the future, you can use .reindex() as an alternative. - -See the documentation here: -http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike -""" - -_ILOC_INT_ONLY_ERROR = """ -Location based indexing can only have [integer, integer slice (START point is -INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types. -""" - - -def _parse_tuple(tup): - """Unpack the user input for getitem and setitem and compute ndim - - loc[a] -> ([a], :), 1D - loc[[a,b],] -> ([a,b], :), - loc[a,b] -> ([a], [b]), 0D - """ - row_loc, col_loc = slice(None), slice(None) - - if is_tuple(tup): - row_loc = tup[0] - if len(tup) == 2: - col_loc = tup[1] - if len(tup) > 2: - raise IndexingError('Too many indexers') - else: - row_loc = tup - - ndim = _compute_ndim(row_loc, col_loc) - row_loc = [row_loc] if is_scalar(row_loc) else row_loc - col_loc = [col_loc] if is_scalar(col_loc) else col_loc - - return row_loc, col_loc, ndim - - -def _is_enlargement(locator, coord_df): - """Determine if a locator will enlarge the corrd_df. - - Enlargement happens when you trying to locate using labels isn't in the - original index. In other words, enlargement == adding NaNs ! - """ - if is_list_like(locator) and not is_slice( - locator) and len(locator) > 0 and not is_boolean_array(locator): - n_diff_elems = len(pandas.Index(locator).difference(coord_df.index)) - is_enlargement_boolean = n_diff_elems > 0 - return is_enlargement_boolean - return False - - -def _warn_enlargement(): - warn(FutureWarning(_ENLARGEMENT_WARNING)) - - -def _compute_ndim(row_loc, col_loc): - """Compute the ndim of result from locators - """ - row_scaler = is_scalar(row_loc) - col_scaler = is_scalar(col_loc) - - if row_scaler and col_scaler: - ndim = 0 - elif row_scaler ^ col_scaler: - ndim = 1 - else: - ndim = 2 - - return ndim - - -class _Location_Indexer_Base(): - """Base class for location indexer like loc and iloc - """ - - def __init__(self, ray_df): - self.df = ray_df - self.col_coord_df = ray_df._col_metadata._coord_df - self.row_coord_df = ray_df._row_metadata._coord_df - self.block_oids = ray_df._block_partitions - - self.is_view = False - if isinstance(ray_df, DataFrameView): - self.block_oids = ray_df._block_partitions_data - self.is_view = True - - def __getitem__(self, row_lookup, col_lookup, ndim): - """ - Args: - row_lookup: A pandas dataframe, a partial view from row_coord_df - col_lookup: A pandas dataframe, a partial view from col_coord_df - ndim: the dimension of returned data - """ - if ndim == 2: - return self._generate_view(row_lookup, col_lookup) - - extracted = self._retrive_items(row_lookup, col_lookup) - if ndim == 1: - result = ray.get(_blocks_to_col.remote(*extracted)).squeeze() - - if is_scalar(result): - result = pandas.Series(result) - - scaler_axis = row_lookup if len(row_lookup) == 1 else col_lookup - series_name = scaler_axis.iloc[0].name - result.name = series_name - - index_axis = row_lookup if len(col_lookup) == 1 else col_lookup - result.index = index_axis.index - - if ndim == 0: - result = ray.get(extracted[0]).squeeze() - - return result - - def _retrive_items(self, row_lookup, col_lookup): - """Given lookup dataframes, return a list of result oids - """ - result_oids = [] - - # We have to copy before we groupby because - # https://github.com/pandas-dev/pandas/issues/10043 - row_groups = row_lookup.copy().groupby('partition') - col_groups = col_lookup.copy().groupby('partition') - for row_blk, row_data in row_groups: - for col_blk, col_data in col_groups: - block_oid = self.block_oids[row_blk, col_blk] - row_idx = row_data['index_within_partition'] - col_idx = col_data['index_within_partition'] - - result_oid = extractor.remote(block_oid, row_idx, col_idx) - result_oids.append(result_oid) - return result_oids - - def _generate_view(self, row_lookup, col_lookup): - """Generate a DataFrameView from lookup - """ - row_lengths = [0] * len(self.df._row_metadata._lengths) - for i in row_lookup["partition"]: - row_lengths[i] += 1 - col_lengths = [0] * len(self.df._col_metadata._lengths) - for i in col_lookup["partition"]: - col_lengths[i] += 1 - - row_metadata_view = _IndexMetadata( - coord_df_oid=row_lookup, lengths_oid=row_lengths) - - col_metadata_view = _IndexMetadata( - coord_df_oid=col_lookup, lengths_oid=col_lengths) - - df_view = DataFrameView( - block_partitions=self.block_oids, - row_metadata=row_metadata_view, - col_metadata=col_metadata_view, - index=row_metadata_view.index, - columns=col_metadata_view.index) - - return df_view - - def __setitem__(self, row_lookup, col_lookup, item): - """ - Args: - row_lookup: A pandas dataframe, a partial view from row_coord_df - col_lookup: A pandas dataframe, a partial view from col_coord_df - item: The new item needs to be set. It can be any shape that's - broadcastable to the product of the lookup tables. - """ - to_shape = (len(row_lookup), len(col_lookup)) - item = self._broadcast_item(item, to_shape) - self._write_items(row_lookup, col_lookup, item) - - def _broadcast_item(self, item, to_shape): - """Use numpy to broadcast or reshape item. - - Notes: - - Numpy is memory efficent, there shouldn't be performance issue. - """ - try: - item = np.array(item) - if np.prod(to_shape) == np.prod(item.shape): - return item.reshape(to_shape) - else: - return np.broadcast_to(item, to_shape) - except ValueError: - from_shape = np.array(item).shape - raise ValueError( - "could not broadcast input array from \ - shape {from_shape} into shape {to_shape}".format( - from_shape=from_shape, to_shape=to_shape)) - - def _write_items(self, row_lookup, col_lookup, item): - """Perform remote write and replace blocks. - """ - - # We have to copy before we groupby because - # https://github.com/pandas-dev/pandas/issues/10043 - row_groups = row_lookup.copy().groupby('partition') - col_groups = col_lookup.copy().groupby('partition') - - row_item_index = 0 - for row_blk, row_data in row_groups: - row_len = len(row_data) - - col_item_index = 0 - for col_blk, col_data in col_groups: - col_len = len(col_data) - - block_oid = self.block_oids[row_blk, col_blk] - row_idx = row_data['index_within_partition'] - col_idx = col_data['index_within_partition'] - - item_to_write = item[row_item_index:row_item_index + row_len, - col_item_index:col_item_index + col_len] - - result_oid = writer.remote(block_oid, row_idx, col_idx, - item_to_write) - - if self.is_view: - self.df._block_partitions_data[row_blk, - col_blk] = result_oid - else: - self.df._block_partitions[row_blk, col_blk] = result_oid - - col_item_index += col_len - row_item_index += row_len - - -class _Loc_Indexer(_Location_Indexer_Base): - """A indexer for ray_df.loc[] functionality""" - - def __getitem__(self, key): - row_loc, col_loc, ndim = _parse_tuple(key) - self._handle_enlargement(row_loc, col_loc) - row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) - ndim = self._expand_dim(row_lookup, col_lookup, ndim) - result = super(_Loc_Indexer, self).__getitem__(row_lookup, col_lookup, - ndim) - return result - - def __setitem__(self, key, item): - row_loc, col_loc, _ = _parse_tuple(key) - self._handle_enlargement(row_loc, col_loc) - row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) - super(_Loc_Indexer, self).__setitem__(row_lookup, col_lookup, - item) - - def _handle_enlargement(self, row_loc, col_loc): - """Handle Enlargement (if there is one). - - Returns: - None - """ - locators = [row_loc, col_loc] - coord_dfs = [self.row_coord_df, self.col_coord_df] - axis = ['row', 'col'] - metadata = {'row': self.df._row_metadata, 'col': self.df._col_metadata} - - for loc, coord, axis in zip(locators, coord_dfs, axis): - if _is_enlargement(loc, coord): - new_meta = self._enlarge_axis(loc, axis=axis) - _warn_enlargement() - metadata[axis] = new_meta - - self.row_coord_df = metadata['row']._coord_df - self.col_coord_df = metadata['col']._coord_df - - def _enlarge_axis(self, locator, axis): - """Add rows/columns to block partitions according to locator. - - Returns: - metadata (_IndexMetadata) - """ - # 1. Prepare variables - row_based_bool = axis == 'row' - # major == the axis of the locator - major_meta = self.df._row_metadata if row_based_bool \ - else self.df._col_metadata - minor_meta = self.df._col_metadata if row_based_bool \ - else self.df._row_metadata - - # 2. Compute the nan labels and add blocks - nan_labels = self._compute_enlarge_labels(locator, major_meta.index) - num_nan_labels = len(nan_labels) - blk_part_n_row, blk_part_n_col = self.block_oids.shape - - nan_blk_lens = minor_meta._lengths - nan_blks = np.array([[ - _get_nan_block_id( - num_nan_labels, n_cols, transpose=not row_based_bool) - for n_cols in nan_blk_lens - ]]) - nan_blks = nan_blks.T if not row_based_bool else nan_blks - - self.block_oids = np.concatenate( - [self.block_oids, nan_blks], axis=0 if row_based_bool else 1) - - # 3. Prepare metadata to return - nan_coord_df = pandas.DataFrame(data=[{ - '': name, - 'partition': blk_part_n_row if row_based_bool else blk_part_n_col, - 'index_within_partition': i - } for name, i in zip(nan_labels, np.arange(num_nan_labels)) - ]).set_index('') - - coord_df = pandas.concat([major_meta._coord_df, nan_coord_df]) - coord_df = coord_df.loc[locator] # Re-index that allows duplicates - - lens = major_meta._lengths - lens = np.concatenate([lens, np.array([num_nan_labels])]) - - metadata_view = _IndexMetadata(coord_df_oid=coord_df, lengths_oid=lens) - return metadata_view - - def _compute_enlarge_labels(self, locator, base_index): - """Helper for _enlarge_axis, compute common labels and extra labels. - - Returns: - nan_labels: The labels needs to be added - """ - # base_index_type can be pd.Index or pd.DatetimeIndex - # depending on user input and pandas behavior - # See issue #2264 - base_index_type = type(base_index) - locator_as_index = base_index_type(locator) - - nan_labels = locator_as_index.difference(base_index) - common_labels = locator_as_index.intersection(base_index) - - if len(common_labels) == 0: - raise KeyError( - 'None of [{labels}] are in the [{base_index_name}]'.format( - labels=list(locator_as_index), base_index_name=base_index)) - - return nan_labels - - def _expand_dim(self, row_lookup, col_lookup, ndim): - """Expand the dimension if necessary. - This method is for cases like duplicate labels. - """ - many_rows = len(row_lookup) > 1 - many_cols = len(col_lookup) > 1 - - if ndim == 0 and (many_rows or many_cols): - ndim = 1 - if ndim == 1 and (many_rows and many_cols): - ndim = 2 - - return ndim - - def _compute_lookup(self, row_loc, col_loc): - # We use reindex for list to avoid duplicates. - row_lookup = self.row_coord_df.loc[row_loc] - col_lookup = self.col_coord_df.loc[col_loc] - return row_lookup, col_lookup - - -class _iLoc_Indexer(_Location_Indexer_Base): - """A indexer for ray_df.iloc[] functionality""" - - def __getitem__(self, key): - row_loc, col_loc, ndim = _parse_tuple(key) - - self._check_dtypes(row_loc) - self._check_dtypes(col_loc) - - row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) - result = super(_iLoc_Indexer, self).__getitem__( - row_lookup, col_lookup, ndim) - return result - - def __setitem__(self, key, item): - row_loc, col_loc, _ = _parse_tuple(key) - - self._check_dtypes(row_loc) - self._check_dtypes(col_loc) - - row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) - super(_iLoc_Indexer, self).__setitem__( - row_lookup, col_lookup, item) - - def _compute_lookup(self, row_loc, col_loc): - # We use reindex for list to avoid duplicates. - return self.row_coord_df.iloc[row_loc], self.col_coord_df.iloc[col_loc] - - def _check_dtypes(self, locator): - is_int = is_integer(locator) - is_int_slice = is_integer_slice(locator) - is_int_list = is_list_like(locator) and all(map(is_integer, locator)) - is_bool_arr = is_boolean_array(locator) - - if not any([is_int, is_int_slice, is_int_list, is_bool_arr]): - raise ValueError(_ILOC_INT_ONLY_ERROR) - - -class DataFrameView(DataFrame): - """A subclass of DataFrame where the index can be smaller than blocks. - """ - - def __init__(self, block_partitions, row_metadata, col_metadata, index, - columns): - self._block_partitions = block_partitions - self._row_metadata = row_metadata - self._col_metadata = col_metadata - self.index = index - self.columns = columns - - def _get_block_partitions(self): - oid_arr = _mask_block_partitions(self._block_partitions_data, - self._row_metadata, - self._col_metadata) - return oid_arr - - def _set_block_partitions(self, new_block_partitions): - self._block_partitions_data = new_block_partitions - - _block_partitions = property(_get_block_partitions, _set_block_partitions) diff --git a/python/ray/dataframe/io.py b/python/ray/dataframe/io.py deleted file mode 100644 index 8b1bdbb4e..000000000 --- a/python/ray/dataframe/io.py +++ /dev/null @@ -1,524 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from itertools import chain -from io import BytesIO -import os -import re -import warnings -import pandas - -from pyarrow.parquet import ParquetFile -from pandas.io.common import _infer_compression # don't depend on internal API - - -from .dataframe import ray, DataFrame -from . import get_npartitions -from .utils import from_pandas - - -# Parquet -def read_parquet(path, engine='auto', columns=None, **kwargs): - """Load a parquet object from the file path, returning a DataFrame. - Ray DataFrame only supports pyarrow engine for now. - - Args: - path: The filepath of the parquet file. - We only support local files for now. - engine: Ray only support pyarrow reader. - This argument doesn't do anything for now. - kwargs: Pass into parquet's read_row_group function. - """ - pf = ParquetFile(path) - - n_rows = pf.metadata.num_rows - chunksize = n_rows // get_npartitions() - n_row_groups = pf.metadata.num_row_groups - - idx_regex = re.compile('__index_level_\d+__') - columns = [ - name for name in pf.metadata.schema.names if not idx_regex.match(name) - ] - - df_from_row_groups = [ - _read_parquet_row_group.remote(path, columns, i, kwargs) - for i in range(n_row_groups) - ] - splited_dfs = ray.get( - [_split_df.remote(df, chunksize) for df in df_from_row_groups]) - df_remotes = list(chain.from_iterable(splited_dfs)) - - return DataFrame(row_partitions=df_remotes, columns=columns) - - -@ray.remote -def _read_parquet_row_group(path, columns, row_group_id, kwargs={}): - """Read a parquet row_group given file_path. - """ - pf = ParquetFile(path) - df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas() - return df - - -@ray.remote -def _split_df(pandas_df, chunksize): - """Split a pandas_df into partitions. - - Returns: - remote_df_ids ([ObjectID]) - """ - dataframes = [] - - while len(pandas_df) > chunksize: - t_df = pandas_df[:chunksize] - t_df.reset_index(drop=True) - top = ray.put(t_df) - dataframes.append(top) - pandas_df = pandas_df[chunksize:] - else: - pandas_df = pandas_df.reset_index(drop=True) - dataframes.append(ray.put(pandas_df)) - - return dataframes - - -# CSV -def _compute_offset(fn, npartitions, ignore_first_line=False): - """ - Calculate the currect bytes offsets for a csv file. - Return a list of (start, end) tuple where the end == \n or EOF. - """ - total_bytes = os.path.getsize(fn) - bio = open(fn, 'rb') - if ignore_first_line: - start = len(bio.readline()) - chunksize = (total_bytes - start) // npartitions - else: - start = 0 - chunksize = total_bytes // npartitions - if chunksize == 0: - chunksize = 1 - - offsets = [] - while start < total_bytes: - bio.seek(chunksize, 1) # Move forward {chunksize} bytes - extend_line = bio.readline() # Move after the next \n - total_offset = chunksize + len(extend_line) - # The position of the \n we just crossed. - new_line_cursor = start + total_offset - 1 - offsets.append((start, new_line_cursor)) - start = new_line_cursor + 1 - - bio.close() - return offsets - - -def _get_firstline(file_path): - bio = open(file_path, 'rb') - first = bio.readline() - bio.close() - return first - - -def _infer_column(first_line, kwargs={}): - return pandas.read_csv(BytesIO(first_line), **kwargs).columns - - -@ray.remote -def _read_csv_with_offset(fn, start, end, kwargs={}, header=b''): - kwargs["quoting"] = int(kwargs["quoting"]) # See issue #2078 - - bio = open(fn, 'rb') - bio.seek(start) - to_read = header + bio.read(end - start) - bio.close() - pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs) - index = pandas_df.index - # Partitions must have RangeIndex - pandas_df.index = pandas.RangeIndex(0, len(pandas_df)) - return pandas_df, index - - -@ray.remote -def get_index(*partition_indices): - return partition_indices[0].append(partition_indices[1:]) - - -def read_csv(filepath_or_buffer, - sep=',', - delimiter=None, - header='infer', - names=None, - index_col=None, - usecols=None, - squeeze=False, - prefix=None, - mangle_dupe_cols=True, - dtype=None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=None, - nrows=None, - na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, - skip_blank_lines=True, - parse_dates=False, - infer_datetime_format=False, - keep_date_col=False, - date_parser=None, - dayfirst=False, - iterator=False, - chunksize=None, - compression='infer', - thousands=None, - decimal=b'.', - lineterminator=None, - quotechar='"', - quoting=0, - escapechar=None, - comment=None, - encoding=None, - dialect=None, - tupleize_cols=None, - error_bad_lines=True, - warn_bad_lines=True, - skipfooter=0, - skip_footer=0, - doublequote=True, - delim_whitespace=False, - as_recarray=None, - compact_ints=None, - use_unsigned=None, - low_memory=True, - buffer_lines=None, - memory_map=False, - float_precision=None): - """Read csv file from local disk. - - Args: - filepath: - The filepath of the csv file. - We only support local files for now. - kwargs: Keyword arguments in pandas::from_csv - """ - - kwargs = { - 'sep': sep, - 'delimiter': delimiter, - 'header': header, - 'names': names, - 'index_col': index_col, - 'usecols': usecols, - 'squeeze': squeeze, - 'prefix': prefix, - 'mangle_dupe_cols': mangle_dupe_cols, - 'dtype': dtype, - 'engine': engine, - 'converters': converters, - 'true_values': true_values, - 'false_values': false_values, - 'skipinitialspace': skipinitialspace, - 'skiprows': skiprows, - 'nrows': nrows, - 'na_values': na_values, - 'keep_default_na': keep_default_na, - 'na_filter': na_filter, - 'verbose': verbose, - 'skip_blank_lines': skip_blank_lines, - 'parse_dates': parse_dates, - 'infer_datetime_format': infer_datetime_format, - 'keep_date_col': keep_date_col, - 'date_parser': date_parser, - 'dayfirst': dayfirst, - 'iterator': iterator, - 'chunksize': chunksize, - 'compression': compression, - 'thousands': thousands, - 'decimal': decimal, - 'lineterminator': lineterminator, - 'quotechar': quotechar, - 'quoting': quoting, - 'escapechar': escapechar, - 'comment': comment, - 'encoding': encoding, - 'dialect': dialect, - 'tupleize_cols': tupleize_cols, - 'error_bad_lines': error_bad_lines, - 'warn_bad_lines': warn_bad_lines, - 'skipfooter': skipfooter, - 'skip_footer': skip_footer, - 'doublequote': doublequote, - 'delim_whitespace': delim_whitespace, - 'as_recarray': as_recarray, - 'compact_ints': compact_ints, - 'use_unsigned': use_unsigned, - 'low_memory': low_memory, - 'buffer_lines': buffer_lines, - 'memory_map': memory_map, - 'float_precision': float_precision, - } - - # Default to Pandas read_csv for non-serializable objects - if not isinstance(filepath_or_buffer, str) or \ - _infer_compression(filepath_or_buffer, compression) is not None: - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - pandas_obj = pandas.read_csv(filepath_or_buffer, **kwargs) - if isinstance(pandas_obj, pandas.DataFrame): - return from_pandas(pandas_obj, get_npartitions()) - - return pandas_obj - - filepath = filepath_or_buffer - - # TODO: handle case where header is a list of lines - first_line = _get_firstline(filepath) - columns = _infer_column(first_line, kwargs=kwargs) - if header is None or (header == "infer" and names is not None): - first_line = b"" - ignore_first_line = False - else: - ignore_first_line = True - - offsets = _compute_offset(filepath, get_npartitions(), - ignore_first_line=ignore_first_line) - - # Serialize objects to speed up later use in remote tasks - first_line_id = ray.put(first_line) - kwargs_id = ray.put(kwargs) - - df_obj_ids = [] - index_obj_ids = [] - for start, end in offsets: - if start != 0: - df, index = _read_csv_with_offset._submit( - args=(filepath, start, end, kwargs_id, first_line_id), - num_return_vals=2) - else: - df, index = _read_csv_with_offset._submit( - args=(filepath, start, end, kwargs_id), - num_return_vals=2) - df_obj_ids.append(df) - index_obj_ids.append(index) - - index = get_index.remote(*index_obj_ids) if index_col is not None else None - - return DataFrame(row_partitions=df_obj_ids, columns=columns, index=index) - - -def read_json(path_or_buf=None, - orient=None, - typ='frame', - dtype=True, - convert_axes=True, - convert_dates=True, - keep_default_dates=True, - numpy=False, - precise_float=False, - date_unit=None, - encoding=None, - lines=False, - chunksize=None, - compression='infer'): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_json(path_or_buf, orient, typ, dtype, - convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit, encoding, lines, chunksize, - compression) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_html(io, - match='.+', - flavor=None, - header=None, - index_col=None, - skiprows=None, - attrs=None, - parse_dates=False, - tupleize_cols=None, - thousands=',', - encoding=None, - decimal='.', - converters=None, - na_values=None, - keep_default_na=True): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_html(io, match, flavor, header, index_col, - skiprows, attrs, parse_dates, tupleize_cols, - thousands, encoding, decimal, converters, - na_values, keep_default_na) - ray_frame = from_pandas(port_frame[0], get_npartitions()) - - return ray_frame - - -def read_clipboard(sep=r'\s+'): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_clipboard(sep) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_excel(io, - sheet_name=0, - header=0, - skiprows=None, - skip_footer=0, - index_col=None, - names=None, - usecols=None, - parse_dates=False, - date_parser=None, - na_values=None, - thousands=None, - convert_float=True, - converters=None, - dtype=None, - true_values=None, - false_values=None, - engine=None, - squeeze=False): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_excel(io, sheet_name, header, skiprows, - skip_footer, index_col, names, usecols, - parse_dates, date_parser, na_values, - thousands, convert_float, converters, dtype, - true_values, false_values, engine, squeeze) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_hdf(path_or_buf, - key=None, - mode='r'): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_hdf(path_or_buf, key, mode) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_feather(path, - nthreads=1): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_feather(path) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_msgpack(path_or_buf, - encoding='utf-8', - iterator=False): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_msgpack(path_or_buf, encoding, iterator) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_stata(filepath_or_buffer, - convert_dates=True, - convert_categoricals=True, - encoding=None, - index_col=None, - convert_missing=False, - preserve_dtypes=True, - columns=None, - order_categoricals=True, - chunksize=None, - iterator=False): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_stata(filepath_or_buffer, convert_dates, - convert_categoricals, encoding, index_col, - convert_missing, preserve_dtypes, columns, - order_categoricals, chunksize, iterator) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_sas(filepath_or_buffer, - format=None, - index=None, - encoding=None, - chunksize=None, - iterator=False): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_sas(filepath_or_buffer, format, index, encoding, - chunksize, iterator) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_pickle(path, - compression='infer'): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_pickle(path, compression) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame - - -def read_sql(sql, - con, - index_col=None, - coerce_float=True, - params=None, - parse_dates=None, - columns=None, - chunksize=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_sql(sql, con, index_col, coerce_float, params, - parse_dates, columns, chunksize) - ray_frame = from_pandas(port_frame, get_npartitions()) - - return ray_frame diff --git a/python/ray/dataframe/iterator.py b/python/ray/dataframe/iterator.py deleted file mode 100644 index b26d1cf46..000000000 --- a/python/ray/dataframe/iterator.py +++ /dev/null @@ -1,29 +0,0 @@ -from collections import Iterator - - -class PartitionIterator(Iterator): - def __init__(self, partitions, func): - """PartitionIterator class to define a generator on partitioned data - - Args: - partitions ([ObjectID]): Partitions to iterate over - func (callable): The function to get inner iterables from - each partition - """ - self.partitions = iter(partitions) - self.func = func - self.iter_cache = iter([]) - - def __iter__(self): - return self - - def __next__(self): - return self.next() - - def next(self): - try: - return next(self.iter_cache) - except StopIteration: - next_partition = next(self.partitions) - self.iter_cache = self.func(next_partition) - return self.next() diff --git a/python/ray/dataframe/pandas_code_gen.py b/python/ray/dataframe/pandas_code_gen.py deleted file mode 100644 index 8a122e530..000000000 --- a/python/ray/dataframe/pandas_code_gen.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import inspect - - -def code_gen(pandas_obj, ray_obj, path): - """Generate code skeleton for methods not in Ray - - Args: - pandas_obj: The pandas object to generate code from. - ray_obj: The ray object to diff against. - path: Path to output the file to. - """ - - with open(path, "w") as outfile: - funcs = pandas_ray_diff(pandas_obj, ray_obj) - - for func in funcs: - if func[0] == "_" and func[1] != "_": - continue - if "attr" in func: - # let's not mess with these - continue - try: - outfile.write( - "\ndef " + func + - str(inspect.signature(getattr(pandas_obj, func))) + ":\n") - - except TypeError: - outfile.write("\n@property") - outfile.write("\ndef " + func + "(self):\n") - except ValueError: - continue - outfile.write( - " raise NotImplementedError(\"Not Yet implemented.\")\n") - - -def code_gen_test(ray_obj, path, name): - """Generate tests for methods in Ray.""" - - with open(path, "a") as outfile: - funcs = dir(ray_obj) - - for func in funcs: - if func[0] == "_" and func[1] != "_": - continue - - outfile.write("\n\ndef test_" + func + "():\n") - outfile.write( - " ray_" + name + " = create_test_" + name + "()\n\n" + - " with pytest.raises(NotImplementedError):\n" + - " ray_" + name + "." + func) - try: - first = True - param_num = \ - len(inspect.signature(getattr(ray_obj, func)).parameters) - if param_num > 1: - param_num -= 1 - - for _ in range(param_num): - if first: - outfile.write("(None") - first = False - else: - outfile.write(", None") - except (TypeError, ValueError, NotImplementedError): - outfile.write("\n") - continue - - if first: - outfile.write("(") - outfile.write(")\n") - - -def pandas_ray_diff(pandas_obj, ray_obj): - """Gets the diff of the methods in the Pandas and Ray objects. - - Args: - pandas_obj: The Pandas object to diff. - ray_obj: The Ray object to diff. - - Returns: - A list of method names that are different between the two. - """ - pandas_funcs = dir(pandas_obj) - ray_funcs = dir(ray_obj) - - pandas_funcs = set(filter(lambda f: f[0] != "_" or f[1] == "_", - pandas_funcs)) - - diff = [x for x in pandas_funcs if x not in set(ray_funcs)] - return diff diff --git a/python/ray/dataframe/reshape.py b/python/ray/dataframe/reshape.py deleted file mode 100644 index 1883f11b7..000000000 --- a/python/ray/dataframe/reshape.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import ray -import pandas -import numpy as np - -from pandas import compat -from pandas.core.dtypes.common import is_list_like -from itertools import cycle - -from .dataframe import DataFrame -from .utils import _deploy_func - - -def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None, sparse=False, drop_first=False): - """Convert categorical variable into indicator variables. - - Args: - data (array-like, Series, or DataFrame): data to encode. - prefix (string, [string]): Prefix to apply to each encoded column - label. - prefix_sep (string, [string]): Separator between prefix and value. - dummy_na (bool): Add a column to indicate NaNs. - columns: Which columns to encode. - sparse (bool): Not Implemented: If True, returns SparseDataFrame. - drop_first (bool): Whether to remove the first level of encoded data. - - Returns: - DataFrame or one-hot encoded data. - """ - if not isinstance(data, DataFrame): - return pandas.get_dummies(data, prefix=prefix, prefix_sep=prefix_sep, - dummy_na=dummy_na, columns=columns, - sparse=sparse, drop_first=drop_first) - - if sparse: - raise NotImplementedError( - "SparseDataFrame is not implemented. " - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - if columns is None: - columns_to_encode = data.dtypes.isin([np.dtype("O"), 'category']) - columns_to_encode = data.columns[columns_to_encode] - else: - columns_to_encode = columns - - def check_len(item, name): - len_msg = ("Length of '{name}' ({len_item}) did not match the " - "length of the columns being encoded ({len_enc}).") - - if is_list_like(item): - if not len(item) == len(columns_to_encode): - len_msg = len_msg.format(name=name, len_item=len(item), - len_enc=len(columns_to_encode)) - raise ValueError(len_msg) - - check_len(prefix, 'prefix') - check_len(prefix_sep, 'prefix_sep') - if isinstance(prefix, compat.string_types): - prefix = cycle([prefix]) - prefix = [next(prefix) for i in range(len(columns_to_encode))] - if isinstance(prefix, dict): - prefix = [prefix[col] for col in columns_to_encode] - - if prefix is None: - prefix = columns_to_encode - - # validate separators - if isinstance(prefix_sep, compat.string_types): - prefix_sep = cycle([prefix_sep]) - prefix_sep = [next(prefix_sep) for i in range(len(columns_to_encode))] - elif isinstance(prefix_sep, dict): - prefix_sep = [prefix_sep[col] for col in columns_to_encode] - - if set(columns_to_encode) == set(data.columns): - with_dummies = [] - dropped_columns = pandas.Index() - else: - with_dummies = data.drop(columns_to_encode, axis=1)._col_partitions - dropped_columns = data.columns.drop(columns_to_encode) - - def get_dummies_remote(df, to_drop, prefix, prefix_sep): - df = df.drop(to_drop, axis=1) - - if df.size == 0: - return df, df.columns - - df = pandas.get_dummies(df, prefix=prefix, prefix_sep=prefix_sep, - dummy_na=dummy_na, columns=None, sparse=sparse, - drop_first=drop_first) - columns = df.columns - df.columns = pandas.RangeIndex(0, len(df.columns)) - return df, columns - - total = 0 - columns = [] - for i, part in enumerate(data._col_partitions): - col_index = data._col_metadata.partition_series(i) - - # TODO(kunalgosar): Handle the case of duplicate columns here - to_encode = col_index.index.isin(columns_to_encode) - - to_encode = col_index[to_encode] - to_drop = col_index.drop(to_encode.index) - - result = _deploy_func._submit( - args=(get_dummies_remote, part, to_drop, - prefix[total:total + len(to_encode)], - prefix_sep[total:total + len(to_encode)]), - num_return_vals=2) - - with_dummies.append(result[0]) - columns.append(result[1]) - total += len(to_encode) - - columns = ray.get(columns) - dropped_columns = dropped_columns.append(columns) - - return DataFrame(col_partitions=with_dummies, - columns=dropped_columns, - index=data.index) diff --git a/python/ray/dataframe/series.py b/python/ray/dataframe/series.py deleted file mode 100644 index aa102363d..000000000 --- a/python/ray/dataframe/series.py +++ /dev/null @@ -1,970 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import pandas - -from .utils import _inherit_docstrings - - -def na_op(): - """Pandas uses a similar function to handle na values. - """ - raise NotImplementedError("Not Yet implemented.") - - -@_inherit_docstrings(pandas.Series, excluded=[pandas.Series, - pandas.Series.__init__]) -class Series(object): - - def __init__(self, series_oids): - """Constructor for a Series object. - - Args: - series_oids ([ObjectID]): The list of remote Series objects. - """ - self.series_oids = series_oids - - @property - def T(self): - raise NotImplementedError("Not Yet implemented.") - - def __abs__(self): - raise NotImplementedError("Not Yet implemented.") - - def __add__(self, right, name='__add__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __and__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __array__(self, result=None): - raise NotImplementedError("Not Yet implemented.") - - def __array_prepare__(self, result, context=None): - raise NotImplementedError("Not Yet implemented.") - - @property - def __array_priority__(self): - raise NotImplementedError("Not Yet implemented.") - - def __array_wrap__(self, result, context=None): - raise NotImplementedError("Not Yet implemented.") - - def __bool__(self): - raise NotImplementedError("Not Yet implemented.") - - def __bytes__(self): - raise NotImplementedError("Not Yet implemented.") - - def __class__(self, data=None, index=None, dtype=None, name=None, - copy=False, fastpath=False): - raise NotImplementedError("Not Yet implemented.") - - def __contains__(self, key): - raise NotImplementedError("Not Yet implemented.") - - def __copy__(self, deep=True): - raise NotImplementedError("Not Yet implemented.") - - def __deepcopy__(self, memo=None): - raise NotImplementedError("Not Yet implemented.") - - def __delitem__(self, key): - raise NotImplementedError("Not Yet implemented.") - - def __dir__(self): - return list(type(self).__dict__.keys()) - - def __div__(self, right, name='__truediv__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __divmod__(self, right, name='__divmod__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - @property - def __doc__(self): - raise NotImplementedError("Not Yet implemented.") - - def __eq__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __finalize__(self, other, method=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def __float__(self): - raise NotImplementedError("Not Yet implemented.") - - def __floordiv__(self, right, name='__floordiv__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __ge__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __getitem__(self, key): - raise NotImplementedError("Not Yet implemented.") - - def __getstate__(self): - raise NotImplementedError("Not Yet implemented.") - - def __gt__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __iadd__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __imul__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __int__(self): - raise NotImplementedError("Not Yet implemented.") - - def __invert__(self): - raise NotImplementedError("Not Yet implemented.") - - def __ipow__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __isub__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __iter__(self): - raise NotImplementedError("Not Yet implemented.") - - def __itruediv__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __le__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __len__(self): - raise NotImplementedError("Not Yet implemented.") - - def __long__(self): - raise NotImplementedError("Not Yet implemented.") - - def __lt__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __mod__(self, right, name='__mod__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __mul__(self, right, name='__mul__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __ne__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __neg__(self): - raise NotImplementedError("Not Yet implemented.") - - def __nonzero__(self): - raise NotImplementedError("Not Yet implemented.") - - def __or__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __pow__(self, right, name='__pow__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __repr__(self): - raise NotImplementedError("Not Yet implemented.") - - def __round__(self, decimals=0): - raise NotImplementedError("Not Yet implemented.") - - def __setitem__(self, key, value): - raise NotImplementedError("Not Yet implemented.") - - def __setstate__(self, state): - raise NotImplementedError("Not Yet implemented.") - - def __sizeof__(self): - raise NotImplementedError("Not Yet implemented.") - - def __str__(self): - raise NotImplementedError("Not Yet implemented.") - - def __sub__(self, right, name='__sub__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __truediv__(self, right, name='__truediv__', na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __xor__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def abs(self): - raise NotImplementedError("Not Yet implemented.") - - def add(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def add_prefix(self, prefix): - raise NotImplementedError("Not Yet implemented.") - - def add_suffix(self, suffix): - raise NotImplementedError("Not Yet implemented.") - - def agg(self, func, axis=0, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def aggregate(self, func, axis=0, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): - raise NotImplementedError("Not Yet implemented.") - - def all(self, axis=None, bool_only=None, skipna=None, level=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def any(self, axis=None, bool_only=None, skipna=None, level=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def append(self, to_append, ignore_index=False, verify_integrity=False): - raise NotImplementedError("Not Yet implemented.") - - def apply(self, func, convert_dtype=True, args=(), **kwds): - raise NotImplementedError("Not Yet implemented.") - - def argmax(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def argmin(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def argsort(self, axis=0, kind='quicksort', order=None): - raise NotImplementedError("Not Yet implemented.") - - def as_blocks(self, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def as_matrix(self, columns=None): - raise NotImplementedError("Not Yet implemented.") - - def asfreq(self, freq, method=None, how=None, normalize=False, - fill_value=None): - raise NotImplementedError("Not Yet implemented.") - - def asof(self, where, subset=None): - raise NotImplementedError("Not Yet implemented.") - - def astype(self, dtype, copy=True, errors='raise', **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def at(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def at_time(self, time, asof=False): - raise NotImplementedError("Not Yet implemented.") - - def autocorr(self, lag=1): - raise NotImplementedError("Not Yet implemented.") - - def between(self, left, right, inclusive=True): - raise NotImplementedError("Not Yet implemented.") - - def between_time(self, start_time, end_time, include_start=True, - include_end=True): - raise NotImplementedError("Not Yet implemented.") - - def bfill(self, axis=None, inplace=False, limit=None, downcast=None): - raise NotImplementedError("Not Yet implemented.") - - def bool(self): - raise NotImplementedError("Not Yet implemented.") - - def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def clip_lower(self, threshold, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def clip_upper(self, threshold, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def combine(self, other, func, fill_value=np.nan): - raise NotImplementedError("Not Yet implemented.") - - def combine_first(self, other): - raise NotImplementedError("Not Yet implemented.") - - def compound(self, axis=None, skipna=None, level=None): - raise NotImplementedError("Not Yet implemented.") - - def compress(self, condition, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def consolidate(self, inplace=False): - raise NotImplementedError("Not Yet implemented.") - - def convert_objects(self, convert_dates=True, convert_numeric=False, - convert_timedeltas=True, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def copy(self, deep=True): - raise NotImplementedError("Not Yet implemented.") - - def corr(self, other, method='pearson', min_periods=None): - raise NotImplementedError("Not Yet implemented.") - - def count(self, level=None): - raise NotImplementedError("Not Yet implemented.") - - def cov(self, other, min_periods=None): - raise NotImplementedError("Not Yet implemented.") - - def cummax(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def describe(self, percentiles=None, include=None, exclude=None): - raise NotImplementedError("Not Yet implemented.") - - def diff(self, periods=1): - raise NotImplementedError("Not Yet implemented.") - - def div(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def divide(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def dot(self, other): - raise NotImplementedError("Not Yet implemented.") - - def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): - raise NotImplementedError("Not Yet implemented.") - - def drop_duplicates(self, keep='first', inplace=False): - raise NotImplementedError("Not Yet implemented.") - - def dropna(self, axis=0, inplace=False, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def duplicated(self, keep='first'): - raise NotImplementedError("Not Yet implemented.") - - def eq(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def equals(self, other): - raise NotImplementedError("Not Yet implemented.") - - def ewm(self, com=None, span=None, halflife=None, alpha=None, - min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def expanding(self, min_periods=1, freq=None, center=False, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def factorize(self, sort=False, na_sentinel=-1): - raise NotImplementedError("Not Yet implemented.") - - def ffill(self, axis=None, inplace=False, limit=None, downcast=None): - raise NotImplementedError("Not Yet implemented.") - - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def filter(self, items=None, like=None, regex=None, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def first(self, offset): - raise NotImplementedError("Not Yet implemented.") - - def first_valid_index(self): - raise NotImplementedError("Not Yet implemented.") - - def floordiv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def from_array(self, arr, index=None, name=None, dtype=None, copy=False, - fastpath=False): - raise NotImplementedError("Not Yet implemented.") - - def from_csv(self, path, sep=',', parse_dates=True, header=None, - index_col=0, encoding=None, infer_datetime_format=False): - raise NotImplementedError("Not Yet implemented.") - - def ge(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def get(self, key, default=None): - raise NotImplementedError("Not Yet implemented.") - - def get_dtype_counts(self): - raise NotImplementedError("Not Yet implemented.") - - def get_ftype_counts(self): - raise NotImplementedError("Not Yet implemented.") - - def get_value(self, label, takeable=False): - raise NotImplementedError("Not Yet implemented.") - - def get_values(self): - raise NotImplementedError("Not Yet implemented.") - - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def gt(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def head(self, n=5): - raise NotImplementedError("Not Yet implemented.") - - def hist(self, by=None, ax=None, grid=True, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds): - raise NotImplementedError("Not Yet implemented.") - - def iat(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def idxmax(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def idxmin(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def iloc(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', downcast=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def isin(self, values): - raise NotImplementedError("Not Yet implemented.") - - def isnull(self): - raise NotImplementedError("Not Yet implemented.") - - def item(self): - raise NotImplementedError("Not Yet implemented.") - - def items(self): - raise NotImplementedError("Not Yet implemented.") - - def iteritems(self): - raise NotImplementedError("Not Yet implemented.") - - def ix(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def keys(self): - raise NotImplementedError("Not Yet implemented.") - - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def last(self, offset): - raise NotImplementedError("Not Yet implemented.") - - def last_valid_index(self): - raise NotImplementedError("Not Yet implemented.") - - def le(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def loc(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def lt(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def mad(self, axis=None, skipna=None, level=None): - raise NotImplementedError("Not Yet implemented.") - - def map(self, arg, na_action=None): - raise NotImplementedError("Not Yet implemented.") - - def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, - try_cast=False, raise_on_error=True): - raise NotImplementedError("Not Yet implemented.") - - def max(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def median(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def memory_usage(self, index=True, deep=False): - raise NotImplementedError("Not Yet implemented.") - - def min(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def mod(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def mode(self): - raise NotImplementedError("Not Yet implemented.") - - def mul(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def multiply(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def ne(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def nlargest(self, n=5, keep='first'): - raise NotImplementedError("Not Yet implemented.") - - def nonzero(self): - raise NotImplementedError("Not Yet implemented.") - - def notnull(self): - raise NotImplementedError("Not Yet implemented.") - - def nsmallest(self, n=5, keep='first'): - raise NotImplementedError("Not Yet implemented.") - - def nunique(self, dropna=True): - raise NotImplementedError("Not Yet implemented.") - - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def pipe(self, func, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def plot(self, kind='line', ax=None, figsize=None, use_index=True, - title=None, grid=None, legend=False, style=None, logx=False, - logy=False, loglog=False, xticks=None, yticks=None, xlim=None, - ylim=None, rot=None, fontsize=None, colormap=None, table=False, - yerr=None, xerr=None, label=None, secondary_y=False, **kwds): - raise NotImplementedError("Not Yet implemented.") - - def pop(self, item): - raise NotImplementedError("Not Yet implemented.") - - def pow(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def prod(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def product(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def ptp(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def put(self, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def quantile(self, q=0.5, interpolation='linear'): - raise NotImplementedError("Not Yet implemented.") - - def radd(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rank(self, axis=0, method='average', numeric_only=None, - na_option='keep', ascending=True, pct=False): - raise NotImplementedError("Not Yet implemented.") - - def ravel(self, order='C'): - raise NotImplementedError("Not Yet implemented.") - - def rdiv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def reindex(self, index=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def reindex_axis(self, labels, axis=0, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def reindex_like(self, other, method=None, copy=True, limit=None, - tolerance=None): - raise NotImplementedError("Not Yet implemented.") - - def rename(self, index=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def rename_axis(self, mapper, axis=0, copy=True, inplace=False): - raise NotImplementedError("Not Yet implemented.") - - def reorder_levels(self, order): - raise NotImplementedError("Not Yet implemented.") - - def repeat(self, repeats, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad', axis=None): - raise NotImplementedError("Not Yet implemented.") - - def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, - label=None, convention='start', kind=None, loffset=None, - limit=None, base=0, on=None, level=None): - raise NotImplementedError("Not Yet implemented.") - - def reset_index(self, level=None, drop=False, name=None, inplace=False): - raise NotImplementedError("Not Yet implemented.") - - def reshape(self, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def rfloordiv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rmod(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rmul(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rolling(self, window, min_periods=None, freq=None, center=False, - win_type=None, on=None, axis=0, closed=None): - raise NotImplementedError("Not Yet implemented.") - - def round(self, decimals=0, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def rpow(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rsub(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rtruediv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def sample(self, n=None, frac=None, replace=False, weights=None, - random_state=None, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def searchsorted(self, value, side='left', sorter=None): - raise NotImplementedError("Not Yet implemented.") - - def select(self, crit, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def sem(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def set_axis(self, axis, labels): - raise NotImplementedError("Not Yet implemented.") - - def set_value(self, label, value, takeable=False): - raise NotImplementedError("Not Yet implemented.") - - def shift(self, periods=1, freq=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def slice_shift(self, periods=1, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True): - raise NotImplementedError("Not Yet implemented.") - - def sort_values(self, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): - raise NotImplementedError("Not Yet implemented.") - - def sortlevel(self, level=0, ascending=True, sort_remaining=True): - raise NotImplementedError("Not Yet implemented.") - - def squeeze(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def std(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def sub(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def subtract(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def sum(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def swapaxes(self, axis1, axis2, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def swaplevel(self, i=-2, j=-1, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def tail(self, n=5): - raise NotImplementedError("Not Yet implemented.") - - def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def to_clipboard(self, excel=None, sep=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def to_csv(self, path=None, index=True, sep=',', na_rep='', - float_format=None, header=False, index_label=None, mode='w', - encoding=None, date_format=None, decimal='.'): - raise NotImplementedError("Not Yet implemented.") - - def to_dense(self): - raise NotImplementedError("Not Yet implemented.") - - def to_dict(self): - raise NotImplementedError("Not Yet implemented.") - - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', - verbose=True): - raise NotImplementedError("Not Yet implemented.") - - def to_frame(self, name=None): - raise NotImplementedError("Not Yet implemented.") - - def to_hdf(self, path_or_buf, key, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def to_json(self, path_or_buf=None, orient=None, date_format=None, - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False): - raise NotImplementedError("Not Yet implemented.") - - def to_latex(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, bold_rows=False, - column_format=None, longtable=None, escape=None, - encoding=None, decimal='.', multicolumn=None, - multicolumn_format=None, multirow=None): - raise NotImplementedError("Not Yet implemented.") - - def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def to_period(self, freq=None, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def to_pickle(self, path, compression='infer'): - raise NotImplementedError("Not Yet implemented.") - - def to_sparse(self, kind='block', fill_value=None): - raise NotImplementedError("Not Yet implemented.") - - def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', - index=True, index_label=None, chunksize=None, dtype=None): - raise NotImplementedError("Not Yet implemented.") - - def to_string(self, buf=None, na_rep='NaN', float_format=None, - header=True, index=True, length=False, dtype=False, - name=False, max_rows=None): - raise NotImplementedError("Not Yet implemented.") - - def to_timestamp(self, freq=None, how='start', copy=True): - raise NotImplementedError("Not Yet implemented.") - - def to_xarray(self): - raise NotImplementedError("Not Yet implemented.") - - def tolist(self): - raise NotImplementedError("Not Yet implemented.") - - def transform(self, func, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def transpose(self, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def truediv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def truncate(self, before=None, after=None, axis=None, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def tshift(self, periods=1, freq=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def tz_convert(self, tz, axis=0, level=None, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def tz_localize(self, tz, axis=0, level=None, copy=True, - ambiguous='raise'): - raise NotImplementedError("Not Yet implemented.") - - def unique(self): - raise NotImplementedError("Not Yet implemented.") - - def unstack(self, level=-1, fill_value=None): - raise NotImplementedError("Not Yet implemented.") - - def upandasate(self, other): - raise NotImplementedError("Not Yet implemented.") - - def valid(self, inplace=False, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def value_counts(self, normalize=False, sort=True, ascending=False, - bins=None, dropna=True): - raise NotImplementedError("Not Yet implemented.") - - def var(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def view(self, dtype=None): - raise NotImplementedError("Not Yet implemented.") - - def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - try_cast=False, raise_on_error=True): - raise NotImplementedError("Not Yet implemented.") - - def xs(key, axis=0, level=None, drop_level=True): - raise NotImplementedError("Not Yet implemented.") - - @property - def asobject(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def axes(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def base(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def blocks(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def data(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def dtype(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def dtypes(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def empty(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def flags(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def ftype(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def ftypes(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def hasnans(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def imag(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def index(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def is_copy(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def is_monotonic(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def is_monotonic_decreasing(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def is_monotonic_increasing(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def is_unique(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def itemsize(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def name(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def nbytes(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def ndim(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def real(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def shape(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def size(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def strides(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def values(self): - raise NotImplementedError("Not Yet implemented.") diff --git a/python/ray/dataframe/test/test_concat.py b/python/ray/dataframe/test/test_concat.py deleted file mode 100644 index 62e881d05..000000000 --- a/python/ray/dataframe/test/test_concat.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pytest -import pandas -import ray.dataframe as pd -from ray.dataframe.utils import ( - to_pandas, - from_pandas -) - - -@pytest.fixture -def ray_df_equals_pandas(ray_df, pandas_df): - return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index()) - - -@pytest.fixture -def generate_dfs(): - df = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - - df2 = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col6': [12, 13, 14, 15], - 'col7': [0, 0, 0, 0]}) - return df, df2 - - -@pytest.fixture -def generate_none_dfs(): - df = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, None, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [None, None, None, None]}) - - df2 = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col6': [12, 13, 14, 15], - 'col7': [0, 0, 0, 0]}) - return df, df2 - - -@pytest.fixture -def test_df_concat(): - df, df2 = generate_dfs() - - assert(ray_df_equals_pandas(pd.concat([df, df2]), - pandas.concat([df, df2]))) - - -def test_ray_concat(): - df, df2 = generate_dfs() - ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2) - - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2]), - pandas.concat([df, df2])) - - -def test_ray_concat_on_index(): - df, df2 = generate_dfs() - ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2) - - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis='index'), - pandas.concat([df, df2], axis='index')) - - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis='rows'), - pandas.concat([df, df2], axis='rows')) - - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis=0), - pandas.concat([df, df2], axis=0)) - - -def test_ray_concat_on_column(): - df, df2 = generate_dfs() - ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2) - - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis=1), - pandas.concat([df, df2], axis=1)) - - assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis="columns"), - pandas.concat([df, df2], axis="columns")) - - -def test_invalid_axis_errors(): - df, df2 = generate_dfs() - ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2) - - with pytest.raises(ValueError): - pd.concat([ray_df, ray_df2], axis=2) - - -def test_mixed_concat(): - df, df2 = generate_dfs() - df3 = df.copy() - - mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3] - - assert(ray_df_equals_pandas(pd.concat(mixed_dfs), - pandas.concat([df, df2, df3]))) - - -def test_mixed_inner_concat(): - df, df2 = generate_dfs() - df3 = df.copy() - - mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3] - - assert(ray_df_equals_pandas(pd.concat(mixed_dfs, join='inner'), - pandas.concat([df, df2, df3], join='inner'))) - - -def test_mixed_none_concat(): - df, df2 = generate_none_dfs() - df3 = df.copy() - - mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3] - - assert(ray_df_equals_pandas(pd.concat(mixed_dfs), - pandas.concat([df, df2, df3]))) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py deleted file mode 100644 index 39e2b0585..000000000 --- a/python/ray/dataframe/test/test_dataframe.py +++ /dev/null @@ -1,3353 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pytest -import numpy as np -import pandas -import pandas.util.testing as tm -from pandas.tests.frame.common import TestData -import ray.dataframe as pd -from ray.dataframe.utils import to_pandas - - -@pytest.fixture -def ray_df_equals_pandas(ray_df, pandas_df): - return to_pandas(ray_df).equals(pandas_df) - - -@pytest.fixture -def ray_series_equals_pandas(ray_series, pandas_series): - return ray_series.equals(pandas_series) - - -@pytest.fixture -def ray_df_equals(ray_df1, ray_df2): - return ray_df1.equals(ray_df2) - - -@pytest.fixture -def create_test_dataframe(): - return pd.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - - -def test_int_dataframe(): - frame_data = {'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]} - pandas_df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - testfuncs = [lambda x: x + 1, - lambda x: str(x), - lambda x: x * x, - lambda x: x, - lambda x: False] - - query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] - - keys = ['col1', - 'col2', - 'col3', - 'col4'] - - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} - - test_filter(ray_df, pandas_df, filter_by) - test_index(ray_df, pandas_df) - test_size(ray_df, pandas_df) - test_ndim(ray_df, pandas_df) - test_ftypes(ray_df, pandas_df) - test_dtypes(ray_df, pandas_df) - test_values(ray_df, pandas_df) - test_axes(ray_df, pandas_df) - test_shape(ray_df, pandas_df) - test_add_prefix(ray_df, pandas_df) - test_add_suffix(ray_df, pandas_df) - - for testfunc in testfuncs: - test_applymap(ray_df, pandas_df, testfunc) - - test_copy(ray_df) - test_sum(ray_df, pandas_df) - test_prod(ray_df, pandas_df) - test_product(ray_df, pandas_df) - test_abs(ray_df, pandas_df) - test_keys(ray_df, pandas_df) - test_transpose(ray_df, pandas_df) - test_round(ray_df, pandas_df) - test_query(ray_df, pandas_df, query_funcs) - - test_mean(ray_df, pandas_df) - test_var(ray_df, pandas_df) - test_std(ray_df, pandas_df) - test_median(ray_df, pandas_df) - - quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] - - for q in quantiles: - test_quantile(ray_df, pandas_df, q) - - test_describe(ray_df, pandas_df) - test_diff(ray_df, pandas_df) - test_rank(ray_df, pandas_df) - - test_all(ray_df, pandas_df) - test_any(ray_df, pandas_df) - test___getitem__(ray_df, pandas_df) - test___neg__(ray_df, pandas_df) - test___iter__(ray_df, pandas_df) - test___abs__(ray_df, pandas_df) - test___delitem__(ray_df, pandas_df) - test___copy__(ray_df, pandas_df) - test___deepcopy__(ray_df, pandas_df) - test_bool(ray_df, pandas_df) - test_count(ray_df, pandas_df) - test_head(ray_df, pandas_df, 2) - test_head(ray_df, pandas_df) - test_tail(ray_df, pandas_df) - test_idxmax(ray_df, pandas_df) - test_idxmin(ray_df, pandas_df) - test_pop(ray_df, pandas_df) - - test___len__(ray_df, pandas_df) - test_first_valid_index(ray_df, pandas_df) - test_last_valid_index(ray_df, pandas_df) - - for key in keys: - test_get(ray_df, pandas_df, key) - - test_get_dtype_counts(ray_df, pandas_df) - test_get_ftype_counts(ray_df, pandas_df) - test_iterrows(ray_df, pandas_df) - test_items(ray_df, pandas_df) - test_iteritems(ray_df, pandas_df) - test_itertuples(ray_df, pandas_df) - - test_max(ray_df, pandas_df) - test_min(ray_df, pandas_df) - test_notna(ray_df, pandas_df) - test_notnull(ray_df, pandas_df) - test_cummax(ray_df, pandas_df) - test_cummin(ray_df, pandas_df) - test_cumprod(ray_df, pandas_df) - test_cumsum(ray_df, pandas_df) - test_pipe(ray_df, pandas_df) - - # test_loc(ray_df, pandas_df) - # test_iloc(ray_df, pandas_df) - - labels = ['a', 'b', 'c', 'd'] - test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') - labels.append('e') - test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') - - for key in keys: - test_set_index(ray_df, pandas_df, key) - - test_reset_index(ray_df, pandas_df) - test_reset_index(ray_df, pandas_df, inplace=True) - - for key in keys: - test___contains__(ray_df, key, True) - test___contains__(ray_df, "Not Exists", False) - - for key in keys: - test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key]) - test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key]) - - test___array__(ray_df, pandas_df) - - apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'], - ['sum', 'sum']] - for func in apply_agg_functions: - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - if not isinstance(func, list): - test_agg(ray_df, pandas_df, func, 1) - test_apply(ray_df, pandas_df, func, 1) - test_aggregate(ray_df, pandas_df, func, 1) - else: - with pytest.raises(TypeError): - test_agg(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_aggregate(ray_df, pandas_df, func, 1) - - func = ['sum', lambda df: df.sum()] - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(TypeError): - test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_agg(ray_df, pandas_df, func, 1) - - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) - test_apply(ray_df, pandas_df, lambda df: -df, 0) - test_transform(ray_df, pandas_df) - - -def test_float_dataframe(): - frame_data = {'col1': [0.0, 1.0, 2.0, 3.0], - 'col2': [4.0, 5.0, 6.0, 7.0], - 'col3': [8.0, 9.0, 10.0, 11.0], - 'col4': [12.0, 13.0, 14.0, 15.0], - 'col5': [0.0, 0.0, 0.0, 0.0]} - - pandas_df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - testfuncs = [lambda x: x + 1, - lambda x: str(x), - lambda x: x * x, - lambda x: x, - lambda x: False] - - query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] - - keys = ['col1', - 'col2', - 'col3', - 'col4'] - - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} - - test_filter(ray_df, pandas_df, filter_by) - test_index(ray_df, pandas_df) - test_size(ray_df, pandas_df) - test_ndim(ray_df, pandas_df) - test_ftypes(ray_df, pandas_df) - test_dtypes(ray_df, pandas_df) - test_values(ray_df, pandas_df) - test_axes(ray_df, pandas_df) - test_shape(ray_df, pandas_df) - test_add_prefix(ray_df, pandas_df) - test_add_suffix(ray_df, pandas_df) - - for testfunc in testfuncs: - test_applymap(ray_df, pandas_df, testfunc) - - test_copy(ray_df) - test_sum(ray_df, pandas_df) - test_prod(ray_df, pandas_df) - test_product(ray_df, pandas_df) - test_abs(ray_df, pandas_df) - test_keys(ray_df, pandas_df) - test_transpose(ray_df, pandas_df) - test_round(ray_df, pandas_df) - test_query(ray_df, pandas_df, query_funcs) - - test_mean(ray_df, pandas_df) - # TODO Clear floating point error. - # test_var(ray_df, pandas_df) - test_std(ray_df, pandas_df) - test_median(ray_df, pandas_df) - - quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] - - for q in quantiles: - test_quantile(ray_df, pandas_df, q) - - test_describe(ray_df, pandas_df) - test_diff(ray_df, pandas_df) - test_rank(ray_df, pandas_df) - - test_all(ray_df, pandas_df) - test_any(ray_df, pandas_df) - test___getitem__(ray_df, pandas_df) - test___neg__(ray_df, pandas_df) - test___iter__(ray_df, pandas_df) - test___abs__(ray_df, pandas_df) - test___delitem__(ray_df, pandas_df) - test___copy__(ray_df, pandas_df) - test___deepcopy__(ray_df, pandas_df) - test_bool(ray_df, pandas_df) - test_count(ray_df, pandas_df) - test_head(ray_df, pandas_df, 3) - test_head(ray_df, pandas_df) - test_tail(ray_df, pandas_df) - test_idxmax(ray_df, pandas_df) - test_idxmin(ray_df, pandas_df) - test_pop(ray_df, pandas_df) - test_max(ray_df, pandas_df) - test_min(ray_df, pandas_df) - test_notna(ray_df, pandas_df) - test_notnull(ray_df, pandas_df) - test_cummax(ray_df, pandas_df) - test_cummin(ray_df, pandas_df) - test_cumprod(ray_df, pandas_df) - test_cumsum(ray_df, pandas_df) - test_pipe(ray_df, pandas_df) - - test___len__(ray_df, pandas_df) - test_first_valid_index(ray_df, pandas_df) - test_last_valid_index(ray_df, pandas_df) - - for key in keys: - test_get(ray_df, pandas_df, key) - - test_get_dtype_counts(ray_df, pandas_df) - test_get_ftype_counts(ray_df, pandas_df) - test_iterrows(ray_df, pandas_df) - test_items(ray_df, pandas_df) - test_iteritems(ray_df, pandas_df) - test_itertuples(ray_df, pandas_df) - - # test_loc(ray_df, pandas_df) - # test_iloc(ray_df, pandas_df) - - labels = ['a', 'b', 'c', 'd'] - test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') - labels.append('e') - test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') - - for key in keys: - test_set_index(ray_df, pandas_df, key) - test_set_index(ray_df, pandas_df, key, inplace=True) - - test_reset_index(ray_df, pandas_df) - test_reset_index(ray_df, pandas_df, inplace=True) - - for key in keys: - test___contains__(ray_df, key, True) - test___contains__(ray_df, "Not Exists", False) - - for key in keys: - test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key]) - test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key]) - - # TODO Nans are always not equal to each other, fix it - # test___array__(ray_df, pandas_df) - - apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'], - ['sum', 'sum']] - for func in apply_agg_functions: - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - if not isinstance(func, list): - test_agg(ray_df, pandas_df, func, 1) - test_apply(ray_df, pandas_df, func, 1) - test_aggregate(ray_df, pandas_df, func, 1) - else: - with pytest.raises(TypeError): - test_agg(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_aggregate(ray_df, pandas_df, func, 1) - - func = ['sum', lambda df: df.sum()] - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(TypeError): - test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_agg(ray_df, pandas_df, func, 1) - - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) - test_apply(ray_df, pandas_df, lambda df: -df, 0) - test_transform(ray_df, pandas_df) - - -def test_mixed_dtype_dataframe(): - frame_data = {'col1': [1, 2, 3, 4], - 'col2': [4, 5, 6, 7], - 'col3': [8.0, 9.4, 10.1, 11.3], - 'col4': ['a', 'b', 'c', 'd']} - - pandas_df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - testfuncs = [lambda x: x + x, - lambda x: str(x), - lambda x: x, - lambda x: False] - - query_funcs = ['col1 < col2', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] - - keys = ['col1', - 'col2', - 'col3', - 'col4'] - - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} - - test_filter(ray_df, pandas_df, filter_by) - test_index(ray_df, pandas_df) - test_size(ray_df, pandas_df) - test_ndim(ray_df, pandas_df) - test_ftypes(ray_df, pandas_df) - test_dtypes(ray_df, pandas_df) - test_values(ray_df, pandas_df) - test_axes(ray_df, pandas_df) - test_shape(ray_df, pandas_df) - test_add_prefix(ray_df, pandas_df) - test_add_suffix(ray_df, pandas_df) - - for testfunc in testfuncs: - test_applymap(ray_df, pandas_df, testfunc) - - test_copy(ray_df) - test_sum(ray_df, pandas_df) - - with pytest.raises(TypeError): - test_abs(ray_df, pandas_df) - test___abs__(ray_df, pandas_df) - - test_keys(ray_df, pandas_df) - test_transpose(ray_df, pandas_df) - test_round(ray_df, pandas_df) - test_query(ray_df, pandas_df, query_funcs) - - test_mean(ray_df, pandas_df) - # TODO Clear floating point error. - # test_var(ray_df, pandas_df) - test_std(ray_df, pandas_df) - test_median(ray_df, pandas_df) - - quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] - - for q in quantiles: - test_quantile(ray_df, pandas_df, q) - - test_describe(ray_df, pandas_df) - - # TODO Reolve once Pandas-20962 is resolved. - # test_rank(ray_df, pandas_df) - - test_all(ray_df, pandas_df) - test_any(ray_df, pandas_df) - test___getitem__(ray_df, pandas_df) - - with pytest.raises(TypeError): - test___neg__(ray_df, pandas_df) - - test___iter__(ray_df, pandas_df) - test___delitem__(ray_df, pandas_df) - test___copy__(ray_df, pandas_df) - test___deepcopy__(ray_df, pandas_df) - test_bool(ray_df, pandas_df) - test_count(ray_df, pandas_df) - test_head(ray_df, pandas_df, 2) - test_head(ray_df, pandas_df) - test_tail(ray_df, pandas_df) - - with pytest.raises(TypeError): - test_idxmax(ray_df, pandas_df) - with pytest.raises(TypeError): - test_idxmin(ray_df, pandas_df) - - test_pop(ray_df, pandas_df) - test_max(ray_df, pandas_df) - test_min(ray_df, pandas_df) - test_notna(ray_df, pandas_df) - test_notnull(ray_df, pandas_df) - test_pipe(ray_df, pandas_df) - - # TODO Fix pandas so that the behavior is correct - # We discovered a bug where argmax does not always give the same result - # depending on what your other dtypes are. - # test_cummax(ray_df, pandas_df) - # test_cummin(ray_df, pandas_df) - # test_cumprod(ray_df, pandas_df) - # test_cumsum(ray_df, pandas_df) - - test___len__(ray_df, pandas_df) - test_first_valid_index(ray_df, pandas_df) - test_last_valid_index(ray_df, pandas_df) - - for key in keys: - test_get(ray_df, pandas_df, key) - - test_get_dtype_counts(ray_df, pandas_df) - test_get_ftype_counts(ray_df, pandas_df) - test_iterrows(ray_df, pandas_df) - test_items(ray_df, pandas_df) - test_iteritems(ray_df, pandas_df) - test_itertuples(ray_df, pandas_df) - - # test_loc(ray_df, pandas_df) - # test_iloc(ray_df, pandas_df) - - labels = ['a', 'b', 'c', 'd'] - test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') - test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') - - for key in keys: - test_set_index(ray_df, pandas_df, key) - test_set_index(ray_df, pandas_df, key, inplace=True) - - test_reset_index(ray_df, pandas_df) - test_reset_index(ray_df, pandas_df, inplace=True) - - for key in keys: - test___contains__(ray_df, key, True) - test___contains__(ray_df, "Not Exists", False) - - for key in keys: - test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key]) - test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key]) - - test___array__(ray_df, pandas_df) - - apply_agg_functions = ['sum', lambda df: df.sum()] - for func in apply_agg_functions: - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - - func = ['sum', lambda df: df.sum()] - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(TypeError): - test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_agg(ray_df, pandas_df, func, 1) - - test_transform(ray_df, pandas_df) - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) - - -def test_nan_dataframe(): - frame_data = {'col1': [1, 2, 3, np.nan], - 'col2': [4, 5, np.nan, 7], - 'col3': [8, np.nan, 10, 11], - 'col4': [np.nan, 13, 14, 15]} - - pandas_df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - testfuncs = [lambda x: x + x, - lambda x: str(x), - lambda x: x, - lambda x: False] - - query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)'] - - keys = ['col1', - 'col2', - 'col3', - 'col4'] - - filter_by = {'items': ['col1', 'col5'], - 'regex': '4$|3$', - 'like': 'col'} - - test_filter(ray_df, pandas_df, filter_by) - test_index(ray_df, pandas_df) - test_size(ray_df, pandas_df) - test_ndim(ray_df, pandas_df) - test_ftypes(ray_df, pandas_df) - test_dtypes(ray_df, pandas_df) - test_values(ray_df, pandas_df) - test_axes(ray_df, pandas_df) - test_shape(ray_df, pandas_df) - test_add_prefix(ray_df, pandas_df) - test_add_suffix(ray_df, pandas_df) - - for testfunc in testfuncs: - test_applymap(ray_df, pandas_df, testfunc) - - test_copy(ray_df) - test_sum(ray_df, pandas_df) - test_abs(ray_df, pandas_df) - test_keys(ray_df, pandas_df) - test_transpose(ray_df, pandas_df) - test_round(ray_df, pandas_df) - test_query(ray_df, pandas_df, query_funcs) - - test_mean(ray_df, pandas_df) - test_var(ray_df, pandas_df) - test_std(ray_df, pandas_df) - test_median(ray_df, pandas_df) - - quantiles = [0.25, 0.5, 0.75, 0.66, 0.01] - - for q in quantiles: - test_quantile(ray_df, pandas_df, q) - - test_describe(ray_df, pandas_df) - test_diff(ray_df, pandas_df) - test_rank(ray_df, pandas_df) - - test_all(ray_df, pandas_df) - test_any(ray_df, pandas_df) - test___getitem__(ray_df, pandas_df) - test___neg__(ray_df, pandas_df) - test___iter__(ray_df, pandas_df) - test___abs__(ray_df, pandas_df) - test___delitem__(ray_df, pandas_df) - test___copy__(ray_df, pandas_df) - test___deepcopy__(ray_df, pandas_df) - test_bool(ray_df, pandas_df) - test_count(ray_df, pandas_df) - test_head(ray_df, pandas_df, 2) - test_head(ray_df, pandas_df) - test_tail(ray_df, pandas_df) - test_idxmax(ray_df, pandas_df) - test_idxmin(ray_df, pandas_df) - test_pop(ray_df, pandas_df) - test_max(ray_df, pandas_df) - test_min(ray_df, pandas_df) - test_notna(ray_df, pandas_df) - test_notnull(ray_df, pandas_df) - test_cummax(ray_df, pandas_df) - test_cummin(ray_df, pandas_df) - test_cumprod(ray_df, pandas_df) - test_cumsum(ray_df, pandas_df) - test_pipe(ray_df, pandas_df) - - test___len__(ray_df, pandas_df) - test_first_valid_index(ray_df, pandas_df) - test_last_valid_index(ray_df, pandas_df) - - for key in keys: - test_get(ray_df, pandas_df, key) - - test_get_dtype_counts(ray_df, pandas_df) - test_get_ftype_counts(ray_df, pandas_df) - test_iterrows(ray_df, pandas_df) - test_items(ray_df, pandas_df) - test_iteritems(ray_df, pandas_df) - test_itertuples(ray_df, pandas_df) - - # test_loc(ray_df, pandas_df) - # test_iloc(ray_df, pandas_df) - - labels = ['a', 'b', 'c', 'd'] - test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') - test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') - - for key in keys: - test_set_index(ray_df, pandas_df, key) - test_set_index(ray_df, pandas_df, key, inplace=True) - - test_reset_index(ray_df, pandas_df) - test_reset_index(ray_df, pandas_df, inplace=True) - - for key in keys: - test___contains__(ray_df, key, True) - test___contains__(ray_df, "Not Exists", False) - - for key in keys: - test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key]) - test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key]) - test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key]) - - # TODO Nans are always not equal to each other, fix it - # test___array__(ray_df, pandas_df) - - apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'], - ['sum', 'sum']] - for func in apply_agg_functions: - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - if not isinstance(func, list): - test_agg(ray_df, pandas_df, func, 1) - test_apply(ray_df, pandas_df, func, 1) - test_aggregate(ray_df, pandas_df, func, 1) - else: - with pytest.raises(TypeError): - test_agg(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_aggregate(ray_df, pandas_df, func, 1) - - func = ['sum', lambda df: df.sum()] - test_apply(ray_df, pandas_df, func, 0) - test_aggregate(ray_df, pandas_df, func, 0) - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(TypeError): - test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(TypeError): - test_agg(ray_df, pandas_df, func, 1) - - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) - test_apply(ray_df, pandas_df, lambda df: -df, 0) - test_transform(ray_df, pandas_df) - - -def test_empty_df(): - df = pd.DataFrame(index=['a', 'b']) - test_is_empty(df) - tm.assert_index_equal(df.index, pd.Index(['a', 'b'])) - assert len(df.columns) == 0 - - df = pd.DataFrame(columns=['a', 'b']) - test_is_empty(df) - assert len(df.index) == 0 - tm.assert_index_equal(df.columns, pd.Index(['a', 'b'])) - - df = pd.DataFrame() - test_is_empty(df) - assert len(df.index) == 0 - assert len(df.columns) == 0 - - df = pd.DataFrame(index=['a', 'b']) - test_is_empty(df) - tm.assert_index_equal(df.index, pd.Index(['a', 'b'])) - assert len(df.columns) == 0 - - df = pd.DataFrame(columns=['a', 'b']) - test_is_empty(df) - assert len(df.index) == 0 - tm.assert_index_equal(df.columns, pd.Index(['a', 'b'])) - - df = pd.DataFrame() - test_is_empty(df) - assert len(df.index) == 0 - assert len(df.columns) == 0 - - -@pytest.fixture -def test_is_empty(df): - assert df.size == 0 and df.empty - assert df.shape[0] == 0 or df.shape[1] == 0 - - -def test_dense_nan_df(): - frame_data = [[np.nan, 2, np.nan, 0], - [3, 4, np.nan, 1], - [np.nan, np.nan, np.nan, 5]] - ray_df = pd.DataFrame(frame_data, columns=list('ABCD')) - - pd_df = pandas.DataFrame(frame_data, columns=list('ABCD')) - - column_subsets = [list('AD'), list('BC'), list('CD')] - row_subsets = [[0, 1], [0, 1, 2], [2, 0]] - - test_dropna(ray_df, pd_df) - test_dropna_inplace(ray_df, pd_df) - test_dropna_multiple_axes(ray_df, pd_df) - test_dropna_multiple_axes_inplace(ray_df, pd_df) - test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets) - test_dropna_subset_error(ray_df) - - -@pytest.fixture -def test_inter_df_math(op, simple=False): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - - ray_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df), - getattr(pandas_df, op)(pandas_df)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) - - frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} - ray_df2 = pd.DataFrame(frame_data) - pandas_df2 = pandas.DataFrame(frame_data) - - assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df2), - getattr(pandas_df, op)(pandas_df2)) - - list_test = [0, 1, 2, 4] - - if not simple: - assert ray_df_equals_pandas(getattr(ray_df, op)(list_test, axis=1), - getattr(pandas_df, op)(list_test, axis=1)) - - assert ray_df_equals_pandas(getattr(ray_df, op)(list_test, axis=0), - getattr(pandas_df, op)(list_test, axis=0)) - - -@pytest.fixture -def test_comparison_inter_ops(op): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - - ray_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df), - getattr(pandas_df, op)(pandas_df)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) - - frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} - - ray_df2 = pd.DataFrame(frame_data) - pandas_df2 = pandas.DataFrame(frame_data) - - assert ray_df_equals_pandas(getattr(ray_df2, op)(ray_df2), - getattr(pandas_df2, op)(pandas_df2)) - - -@pytest.fixture -def test_inter_df_math_right_ops(op): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - - ray_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - assert ray_df_equals_pandas(getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) - - -@pytest.fixture -def test_index(ray_df, pandas_df): - assert ray_df.index.equals(pandas_df.index) - ray_df_cp = ray_df.copy() - pandas_df_cp = pandas_df.copy() - - ray_df_cp.index = [str(i) for i in ray_df_cp.index] - pandas_df_cp.index = [str(i) for i in pandas_df_cp.index] - assert ray_df_cp.index.equals(pandas_df_cp.index) - - -@pytest.fixture -def test_size(ray_df, pandas_df): - assert ray_df.size == pandas_df.size - - -@pytest.fixture -def test_ndim(ray_df, pandas_df): - assert ray_df.ndim == pandas_df.ndim - - -@pytest.fixture -def test_ftypes(ray_df, pandas_df): - assert ray_df.ftypes.equals(pandas_df.ftypes) - - -@pytest.fixture -def test_dtypes(ray_df, pandas_df): - assert ray_df.dtypes.equals(pandas_df.dtypes) - - -@pytest.fixture -def test_values(ray_df, pandas_df): - np.testing.assert_equal(ray_df.values, pandas_df.values) - - -@pytest.fixture -def test_axes(ray_df, pandas_df): - for ray_axis, pd_axis in zip(ray_df.axes, pandas_df.axes): - assert (np.array_equal(ray_axis, pd_axis)) - - -@pytest.fixture -def test_shape(ray_df, pandas_df): - assert ray_df.shape == pandas_df.shape - - -@pytest.fixture -def test_add_prefix(ray_df, pandas_df): - test_prefix = "TEST" - new_ray_df = ray_df.add_prefix(test_prefix) - new_pandas_df = pandas_df.add_prefix(test_prefix) - assert new_ray_df.columns.equals(new_pandas_df.columns) - - -@pytest.fixture -def test_add_suffix(ray_df, pandas_df): - test_suffix = "TEST" - new_ray_df = ray_df.add_suffix(test_suffix) - new_pandas_df = pandas_df.add_suffix(test_suffix) - - assert new_ray_df.columns.equals(new_pandas_df.columns) - - -@pytest.fixture -def test_applymap(ray_df, pandas_df, testfunc): - new_ray_df = ray_df.applymap(testfunc) - new_pandas_df = pandas_df.applymap(testfunc) - - assert ray_df_equals_pandas(new_ray_df, new_pandas_df) - - -@pytest.fixture -def test_copy(ray_df): - new_ray_df = ray_df.copy() - - assert new_ray_df is not ray_df - assert np.array_equal(new_ray_df._block_partitions, - ray_df._block_partitions) - - -@pytest.fixture -def test_sum(ray_df, pandas_df): - assert ray_df.sum().equals(pandas_df.sum()) - - -@pytest.fixture -def test_abs(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.abs(), pandas_df.abs()) - - -@pytest.fixture -def test_keys(ray_df, pandas_df): - assert ray_df.keys().equals(pandas_df.keys()) - - -@pytest.fixture -def test_transpose(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.T, pandas_df.T) - assert ray_df_equals_pandas(ray_df.transpose(), pandas_df.transpose()) - - -@pytest.fixture -def test_get(ray_df, pandas_df, key): - assert ray_df.get(key).equals(pandas_df.get(key)) - assert ray_df.get( - key, default='default').equals(pandas_df.get(key, default='default')) - - -@pytest.fixture -def test_get_dtype_counts(ray_df, pandas_df): - assert ray_df.get_dtype_counts().equals(pandas_df.get_dtype_counts()) - - -@pytest.fixture -def test_get_ftype_counts(ray_df, pandas_df): - assert ray_df.get_ftype_counts().equals(pandas_df.get_ftype_counts()) - - -def test_add(): - test_inter_df_math("add", simple=False) - - -@pytest.fixture -def test_agg(ray_df, pandas_df, func, axis): - ray_result = ray_df.agg(func, axis) - pandas_result = pandas_df.agg(func, axis) - if isinstance(ray_result, pd.DataFrame): - assert ray_df_equals_pandas(ray_result, pandas_result) - else: - assert ray_result.equals(pandas_result) - - -@pytest.fixture -def test_aggregate(ray_df, pandas_df, func, axis): - ray_result = ray_df.aggregate(func, axis) - pandas_result = pandas_df.aggregate(func, axis) - if isinstance(ray_result, pd.DataFrame): - assert ray_df_equals_pandas(ray_result, pandas_result) - else: - assert ray_result.equals(pandas_result) - - -def test_align(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.align(None) - - -@pytest.fixture -def test_all(ray_df, pd_df): - assert pd_df.all().equals(ray_df.all()) - assert pd_df.all(axis=1).equals(ray_df.all(axis=1)) - - -@pytest.fixture -def test_any(ray_df, pd_df): - assert pd_df.any().equals(ray_df.any()) - assert pd_df.any(axis=1).equals(ray_df.any(axis=1)) - - -def test_append(): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - - ray_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - frame_data2 = {"col5": [0], "col6": [1]} - - ray_df2 = pd.DataFrame(frame_data2) - pandas_df2 = pandas.DataFrame(frame_data2) - - assert ray_df_equals_pandas(ray_df.append(ray_df2), - pandas_df.append(pandas_df2)) - - with pytest.raises(ValueError): - ray_df.append(ray_df2, verify_integrity=True) - - -@pytest.fixture -def test_apply(ray_df, pandas_df, func, axis): - ray_result = ray_df.apply(func, axis) - pandas_result = pandas_df.apply(func, axis) - if isinstance(ray_result, pd.DataFrame): - assert ray_df_equals_pandas(ray_result, pandas_result) - else: - assert ray_result.equals(pandas_result) - - -def test_as_blocks(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.as_blocks() - - -def test_as_matrix(): - test_data = TestData() - frame = pd.DataFrame(test_data.frame) - mat = frame.as_matrix() - - frame_columns = frame.columns - for i, row in enumerate(mat): - for j, value in enumerate(row): - col = frame_columns[j] - if np.isnan(value): - assert np.isnan(frame[col][i]) - else: - assert value == frame[col][i] - - # mixed type - mat = pd.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A']) - assert mat[0, 0] == 'bar' - - df = pd.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) - mat = df.as_matrix() - assert mat[0, 0] == 1j - - # single block corner case - mat = pd.DataFrame(test_data.frame).as_matrix(['A', 'B']) - expected = test_data.frame.reindex(columns=['A', 'B']).values - tm.assert_almost_equal(mat, expected) - - -def test_asfreq(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.asfreq(None) - - -def test_asof(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.asof(None) - - -def test_assign(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.assign() - - -def test_astype(): - td = TestData() - ray_df = pd.DataFrame(td.frame) - our_df_casted = ray_df.astype(np.int32) - expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.int32), - index=td.frame.index, - columns=td.frame.columns) - - assert ray_df_equals_pandas(our_df_casted, expected_df_casted) - - our_df_casted = ray_df.astype(np.float64) - expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.float64), - index=td.frame.index, - columns=td.frame.columns) - - assert ray_df_equals_pandas(our_df_casted, expected_df_casted) - - our_df_casted = ray_df.astype(str) - expected_df_casted = pandas.DataFrame(td.frame.values.astype(str), - index=td.frame.index, - columns=td.frame.columns) - - assert ray_df_equals_pandas(our_df_casted, expected_df_casted) - - -def test_at_time(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.at_time(None) - - -def test_between_time(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.between_time(None, None) - - -@pytest.fixture -def test_bfill(): - test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan - ray_df = pd.DataFrame(test_data.tsframe) - assert ray_df_equals_pandas(ray_df.bfill(), test_data.tsframe.bfill()) - - -@pytest.fixture -def test_bool(ray_df, pd_df): - with pytest.raises(ValueError): - ray_df.bool() - pd_df.bool() - - single_bool_pd_df = pandas.DataFrame([True]) - single_bool_ray_df = pd.DataFrame([True]) - - assert single_bool_pd_df.bool() == single_bool_ray_df.bool() - - -def test_boxplot(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.boxplot() - - -def test_clip(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.clip() - - -def test_clip_lower(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.clip_lower(None) - - -def test_clip_upper(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.clip_upper(None) - - -def test_combine(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.combine(None, None) - - -def test_combine_first(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.combine_first(None) - - -def test_compound(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.compound() - - -def test_consolidate(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.consolidate() - - -def test_convert_objects(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.convert_objects() - - -def test_corr(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.corr() - - -def test_corrwith(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.corrwith(None) - - -@pytest.fixture -def test_count(ray_df, pd_df): - assert ray_df.count().equals(pd_df.count()) - assert ray_df.count(axis=1).equals(pd_df.count(axis=1)) - - -def test_cov(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.cov() - - -@pytest.fixture -def test_cummax(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.cummax(), pandas_df.cummax()) - - -@pytest.fixture -def test_cummin(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.cummin(), pandas_df.cummin()) - - -@pytest.fixture -def test_cumprod(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.cumprod(), pandas_df.cumprod()) - - -@pytest.fixture -def test_cumsum(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.cumsum(), pandas_df.cumsum()) - - -@pytest.fixture -def test_describe(ray_df, pandas_df): - assert ray_df.describe().equals(pandas_df.describe()) - - -@pytest.fixture -def test_diff(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.diff(), pandas_df.diff()) - assert ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1)) - assert ray_df_equals_pandas(ray_df.diff(periods=1), - pandas_df.diff(periods=1)) - - -def test_div(): - test_inter_df_math("div", simple=False) - - -def test_divide(): - test_inter_df_math("divide", simple=False) - - -def test_dot(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.dot(None) - - -def test_drop(): - frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} - simple = pandas.DataFrame(frame_data) - ray_simple = pd.DataFrame(frame_data) - assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[['B']]) - assert ray_df_equals_pandas(ray_simple.drop(["A", "B"], axis='columns'), - simple[[]]) - assert ray_df_equals_pandas(ray_simple.drop([0, 1, 3], axis=0), - simple.loc[[2], :]) - assert ray_df_equals_pandas(ray_simple.drop([0, 3], axis='index'), - simple.loc[[1, 2], :]) - - pytest.raises(ValueError, ray_simple.drop, 5) - pytest.raises(ValueError, ray_simple.drop, 'C', 1) - pytest.raises(ValueError, ray_simple.drop, [1, 5]) - pytest.raises(ValueError, ray_simple.drop, ['A', 'C'], 1) - - # errors = 'ignore' - assert ray_df_equals_pandas(ray_simple.drop(5, errors='ignore'), simple) - assert ray_df_equals_pandas(ray_simple.drop([0, 5], errors='ignore'), - simple.loc[[1, 2, 3], :]) - assert ray_df_equals_pandas(ray_simple.drop('C', axis=1, errors='ignore'), - simple) - assert ray_df_equals_pandas(ray_simple.drop(['A', 'C'], axis=1, - errors='ignore'), - simple[['B']]) - - # non-unique - nu_df = pandas.DataFrame(pandas.compat.lzip(range(3), range(-3, 1), - list('abc')), - columns=['a', 'a', 'b']) - ray_nu_df = pd.DataFrame(nu_df) - assert ray_df_equals_pandas(ray_nu_df.drop('a', axis=1), nu_df[['b']]) - assert ray_df_equals_pandas(ray_nu_df.drop('b', axis='columns'), - nu_df['a']) - assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df) - - nu_df = nu_df.set_index(pandas.Index(['X', 'Y', 'X'])) - nu_df.columns = list('abc') - ray_nu_df = pd.DataFrame(nu_df) - assert ray_df_equals_pandas(ray_nu_df.drop('X', axis='rows'), - nu_df.loc[["Y"], :]) - assert ray_df_equals_pandas(ray_nu_df.drop(['X', 'Y'], axis=0), - nu_df.loc[[], :]) - - # inplace cache issue - frame_data = np.random.randn(10, 3) - df = pandas.DataFrame(frame_data, columns=list('abc')) - ray_df = pd.DataFrame(frame_data, columns=list('abc')) - expected = df[~(df.b > 0)] - ray_df.drop(labels=df[df.b > 0].index, inplace=True) - assert ray_df_equals_pandas(ray_df, expected) - - -def test_drop_api_equivalence(): - # equivalence of the labels/axis and index/columns API's - frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] - - ray_df = pd.DataFrame(frame_data, index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - - ray_df1 = ray_df.drop('a') - ray_df2 = ray_df.drop(index='a') - assert ray_df_equals(ray_df1, ray_df2) - - ray_df1 = ray_df.drop('d', 1) - ray_df2 = ray_df.drop(columns='d') - assert ray_df_equals(ray_df1, ray_df2) - - ray_df1 = ray_df.drop(labels='e', axis=1) - ray_df2 = ray_df.drop(columns='e') - assert ray_df_equals(ray_df1, ray_df2) - - ray_df1 = ray_df.drop(['a'], axis=0) - ray_df2 = ray_df.drop(index=['a']) - assert ray_df_equals(ray_df1, ray_df2) - - ray_df1 = ray_df.drop(['a'], axis=0).drop(['d'], axis=1) - ray_df2 = ray_df.drop(index=['a'], columns=['d']) - assert ray_df_equals(ray_df1, ray_df2) - - with pytest.raises(ValueError): - ray_df.drop(labels='a', index='b') - - with pytest.raises(ValueError): - ray_df.drop(labels='a', columns='b') - - with pytest.raises(ValueError): - ray_df.drop(axis=1) - - -def test_drop_duplicates(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.drop_duplicates() - - -@pytest.fixture -def test_dropna(ray_df, pd_df): - assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='all'), - pd_df.dropna(axis=1, how='all')) - - assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='any'), - pd_df.dropna(axis=1, how='any')) - - assert ray_df_equals_pandas(ray_df.dropna(axis=0, how='all'), - pd_df.dropna(axis=0, how='all')) - - assert ray_df_equals_pandas(ray_df.dropna(thresh=2), - pd_df.dropna(thresh=2)) - - -@pytest.fixture -def test_dropna_inplace(ray_df, pd_df): - ray_df = ray_df.copy() - pd_df = pd_df.copy() - - ray_df.dropna(thresh=2, inplace=True) - pd_df.dropna(thresh=2, inplace=True) - - assert ray_df_equals_pandas(ray_df, pd_df) - - ray_df.dropna(axis=1, how='any', inplace=True) - pd_df.dropna(axis=1, how='any', inplace=True) - - assert ray_df_equals_pandas(ray_df, pd_df) - - -@pytest.fixture -def test_dropna_multiple_axes(ray_df, pd_df): - assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=[0, 1]), - pd_df.dropna(how='all', axis=[0, 1])) - assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=(0, 1)), - pd_df.dropna(how='all', axis=(0, 1))) - - -@pytest.fixture -def test_dropna_multiple_axes_inplace(ray_df, pd_df): - ray_df_copy = ray_df.copy() - pd_df_copy = pd_df.copy() - - ray_df_copy.dropna(how='all', axis=[0, 1], inplace=True) - pd_df_copy.dropna(how='all', axis=[0, 1], inplace=True) - - assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) - - ray_df_copy = ray_df.copy() - pd_df_copy = pd_df.copy() - - ray_df_copy.dropna(how='all', axis=(0, 1), inplace=True) - pd_df_copy.dropna(how='all', axis=(0, 1), inplace=True) - - assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) - - -@pytest.fixture -def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets): - for subset in column_subsets: - assert ray_df_equals_pandas(ray_df.dropna(how='all', subset=subset), - pd_df.dropna(how='all', subset=subset)) - - assert ray_df_equals_pandas(ray_df.dropna(how='any', subset=subset), - pd_df.dropna(how='any', subset=subset)) - - for subset in row_subsets: - assert ray_df_equals_pandas( - ray_df.dropna(how='all', axis=1, subset=subset), - pd_df.dropna(how='all', axis=1, subset=subset)) - - assert ray_df_equals_pandas( - ray_df.dropna(how='any', axis=1, subset=subset), - pd_df.dropna(how='any', axis=1, subset=subset)) - - -@pytest.fixture -def test_dropna_subset_error(ray_df): - with pytest.raises(KeyError): - ray_df.dropna(subset=list('EF')) - - with pytest.raises(KeyError): - ray_df.dropna(axis=1, subset=[4, 5]) - - -def test_duplicated(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.duplicated() - - -def test_eq(): - test_comparison_inter_ops("eq") - - -def test_equals(): - frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 4, 1]} - ray_df1 = pd.DataFrame(frame_data) - ray_df2 = pd.DataFrame(frame_data) - - assert ray_df1.equals(ray_df2) - - frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 5, 1]} - ray_df3 = pd.DataFrame(frame_data) - - assert not ray_df3.equals(ray_df1) - assert not ray_df3.equals(ray_df2) - - -def test_eval_df_use_case(): - frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - df.eval("e = arctan2(sin(a), b)", - engine='python', parser='pandas', inplace=True) - ray_df.eval("e = arctan2(sin(a), b)", - engine='python', - parser='pandas', inplace=True) - # TODO: Use a series equality validator. - assert ray_df_equals_pandas(ray_df, df) - - -def test_eval_df_arithmetic_subexpression(): - frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - df.eval("not_e = sin(a + b)", - engine='python', parser='pandas', inplace=True) - ray_df.eval("not_e = sin(a + b)", - engine='python', parser='pandas', inplace=True) - # TODO: Use a series equality validator. - assert ray_df_equals_pandas(ray_df, df) - - -def test_ewm(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.ewm() - - -def test_expanding(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.expanding() - - -@pytest.fixture -def test_ffill(): - test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan - ray_df = pd.DataFrame(test_data.tsframe) - - assert ray_df_equals_pandas(ray_df.ffill(), test_data.tsframe.ffill()) - - -def test_fillna(): - test_fillna_sanity() - test_fillna_downcast() - test_ffill() - test_ffill2() - test_bfill() - test_bfill2() - test_fillna_inplace() - # test_frame_fillna_limit() - # test_frame_pad_backfill_limit() - test_fillna_dtype_conversion() - test_fillna_skip_certain_blocks() - test_fillna_dict_series() - - with pytest.raises(NotImplementedError): - test_fillna_dataframe() - - test_fillna_columns() - test_fillna_invalid_method() - test_fillna_invalid_value() - test_fillna_col_reordering() - - -@pytest.fixture -def test_fillna_sanity(): - test_data = TestData() - tf = test_data.tsframe - tf.loc[tf.index[:5], 'A'] = np.nan - tf.loc[tf.index[-5:], 'A'] = np.nan - - zero_filled = test_data.tsframe.fillna(0) - ray_df = pd.DataFrame(test_data.tsframe).fillna(0) - assert ray_df_equals_pandas(ray_df, zero_filled) - - padded = test_data.tsframe.fillna(method='pad') - ray_df = pd.DataFrame(test_data.tsframe).fillna(method='pad') - assert ray_df_equals_pandas(ray_df, padded) - - # mixed type - mf = test_data.mixed_frame - mf.loc[mf.index[5:20], 'foo'] = np.nan - mf.loc[mf.index[-10:], 'A'] = np.nan - - result = test_data.mixed_frame.fillna(value=0) - ray_df = pd.DataFrame(test_data.mixed_frame).fillna(value=0) - assert ray_df_equals_pandas(ray_df, result) - - result = test_data.mixed_frame.fillna(method='pad') - ray_df = pd.DataFrame(test_data.mixed_frame).fillna(method='pad') - assert ray_df_equals_pandas(ray_df, result) - - pytest.raises(ValueError, test_data.tsframe.fillna) - pytest.raises(ValueError, pd.DataFrame(test_data.tsframe).fillna) - with pytest.raises(ValueError): - pd.DataFrame(test_data.tsframe).fillna(5, method='ffill') - - # mixed numeric (but no float16) - mf = test_data.mixed_float.reindex(columns=['A', 'B', 'D']) - mf.loc[mf.index[-10:], 'A'] = np.nan - result = mf.fillna(value=0) - ray_df = pd.DataFrame(mf).fillna(value=0) - assert ray_df_equals_pandas(ray_df, result) - - result = mf.fillna(method='pad') - ray_df = pd.DataFrame(mf).fillna(method='pad') - assert ray_df_equals_pandas(ray_df, result) - - # TODO: Use this when Arrow issue resolves: - # (https://issues.apache.org/jira/browse/ARROW-2122) - # empty frame - # df = DataFrame(columns=['x']) - # for m in ['pad', 'backfill']: - # df.x.fillna(method=m, inplace=True) - # df.x.fillna(method=m) - - # with different dtype - frame_data = [['a', 'a', np.nan, 'a'], - ['b', 'b', np.nan, 'b'], - ['c', 'c', np.nan, 'c']] - df = pandas.DataFrame(frame_data) - - result = df.fillna({2: 'foo'}) - ray_df = pd.DataFrame(frame_data).fillna({2: 'foo'}) - - assert ray_df_equals_pandas(ray_df, result) - - ray_df = pd.DataFrame(df) - df.fillna({2: 'foo'}, inplace=True) - ray_df.fillna({2: 'foo'}, inplace=True) - assert ray_df_equals_pandas(ray_df, result) - - frame_data = {'Date': [pandas.NaT, pandas.Timestamp("2014-1-1")], - 'Date2': [pandas.Timestamp("2013-1-1"), pandas.NaT]} - df = pandas.DataFrame(frame_data) - result = df.fillna(value={'Date': df['Date2']}) - ray_df = pd.DataFrame(frame_data).fillna(value={'Date': df['Date2']}) - assert ray_df_equals_pandas(ray_df, result) - - # TODO: Use this when Arrow issue resolves: - # (https://issues.apache.org/jira/browse/ARROW-2122) - # with timezone - """ - frame_data = {'A': [pandas.Timestamp('2012-11-11 00:00:00+01:00'), - pandas.NaT]} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - assert ray_df_equals_pandas(ray_df.fillna(method='pad'), - df.fillna(method='pad')) - - frame_data = {'A': [pandas.NaT, - pandas.Timestamp('2012-11-11 00:00:00+01:00')]} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data).fillna(method='bfill') - assert ray_df_equals_pandas(ray_df, df.fillna(method='bfill')) - """ - - -@pytest.fixture -def test_fillna_downcast(): - # infer int64 from float64 - frame_data = {'a': [1., np.nan]} - df = pandas.DataFrame(frame_data) - result = df.fillna(0, downcast='infer') - ray_df = pd.DataFrame(frame_data).fillna(0, downcast='infer') - assert ray_df_equals_pandas(ray_df, result) - - # infer int64 from float64 when fillna value is a dict - df = pandas.DataFrame(frame_data) - result = df.fillna({'a': 0}, downcast='infer') - ray_df = pd.DataFrame(frame_data).fillna({'a': 0}, downcast='infer') - assert ray_df_equals_pandas(ray_df, result) - - -@pytest.fixture -def test_ffill2(): - test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan - ray_df = pd.DataFrame(test_data.tsframe) - assert ray_df_equals_pandas(ray_df.fillna(method='ffill'), - test_data.tsframe.fillna(method='ffill')) - - -@pytest.fixture -def test_bfill2(): - test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan - ray_df = pd.DataFrame(test_data.tsframe) - assert ray_df_equals_pandas(ray_df.fillna(method='bfill'), - test_data.tsframe.fillna(method='bfill')) - - -@pytest.fixture -def test_fillna_inplace(): - frame_data = np.random.randn(10, 4) - df = pandas.DataFrame(frame_data) - df[1][:4] = np.nan - df[3][-4:] = np.nan - - ray_df = pd.DataFrame(df) - df.fillna(value=0, inplace=True) - assert not ray_df_equals_pandas(ray_df, df) - - ray_df.fillna(value=0, inplace=True) - assert ray_df_equals_pandas(ray_df, df) - - ray_df = pd.DataFrame(df).fillna(value={0: 0}, inplace=True) - assert ray_df is None - - df[1][:4] = np.nan - df[3][-4:] = np.nan - ray_df = pd.DataFrame(df) - df.fillna(method='ffill', inplace=True) - - assert not ray_df_equals_pandas(ray_df, df) - - ray_df.fillna(method='ffill', inplace=True) - assert ray_df_equals_pandas(ray_df, df) - - -@pytest.fixture -def test_frame_fillna_limit(): - index = np.arange(10) - frame_data = np.random.randn(10, 4) - df = pandas.DataFrame(frame_data, index=index) - - expected = df[:2].reindex(index) - expected = expected.fillna(method='pad', limit=5) - - ray_df = pd.DataFrame(df[:2].reindex(index)).fillna(method='pad', limit=5) - assert ray_df_equals_pandas(ray_df, expected) - - expected = df[-2:].reindex(index) - expected = expected.fillna(method='backfill', limit=5) - ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(method='backfill', - limit=5) - assert ray_df_equals_pandas(ray_df, expected) - - -@pytest.fixture -def test_frame_pad_backfill_limit(): - index = np.arange(10) - frame_data = np.random.randn(10, 4) - df = pandas.DataFrame(frame_data, index=index) - - result = df[:2].reindex(index) - ray_df = pd.DataFrame(result) - assert ray_df_equals_pandas(ray_df.fillna(method='pad', limit=5), - result.fillna(method='pad', limit=5)) - - result = df[-2:].reindex(index) - ray_df = pd.DataFrame(result) - assert ray_df_equals_pandas(ray_df.fillna(method='backfill', limit=5), - result.fillna(method='backfill', limit=5)) - - -@pytest.fixture -def test_fillna_dtype_conversion(): - # make sure that fillna on an empty frame works - df = pandas.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') - ray_df = pd.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') - assert ray_df_equals_pandas(ray_df.fillna('nan'), df.fillna('nan')) - - frame_data = {'A': [1, np.nan], 'B': [1., 2.]} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - for v in ['', 1, np.nan, 1.0]: - assert ray_df_equals_pandas(ray_df.fillna(v), df.fillna(v)) - - -@pytest.fixture -def test_fillna_skip_certain_blocks(): - # don't try to fill boolean, int blocks - - frame_data = np.random.randn(10, 4).astype(int) - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - assert ray_df_equals_pandas(ray_df.fillna(np.nan), df.fillna(np.nan)) - - -@pytest.fixture -def test_fillna_dict_series(): - frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5}), - df.fillna({'a': 0, 'b': 5})) - - assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5, 'd': 7}), - df.fillna({'a': 0, 'b': 5, 'd': 7})) - - # Series treated same as dict - assert ray_df_equals_pandas(ray_df.fillna(df.max()), df.fillna(df.max())) - - -@pytest.fixture -def test_fillna_dataframe(): - frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]} - df = pandas.DataFrame(frame_data, index=list('VWXYZ')) - ray_df = pd.DataFrame(frame_data, index=list('VWXYZ')) - - # df2 may have different index and columns - df2 = pandas.DataFrame({'a': [np.nan, 10, 20, 30, 40], - 'b': [50, 60, 70, 80, 90], - 'foo': ['bar'] * 5}, - index=list('VWXuZ')) - - # only those columns and indices which are shared get filled - assert ray_df_equals_pandas(ray_df.fillna(df2), df.fillna(df2)) - - -@pytest.fixture -def test_fillna_columns(): - frame_data = np.random.randn(10, 10) - df = pandas.DataFrame(frame_data) - df.values[:, ::2] = np.nan - ray_df = pd.DataFrame(df) - - assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1), - df.fillna(method='ffill', axis=1)) - - df.insert(6, 'foo', 5) - ray_df = pd.DataFrame(df) - assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1), - df.fillna(method='ffill', axis=1)) - - -@pytest.fixture -def test_fillna_invalid_method(): - test_data = TestData() - ray_df = pd.DataFrame(test_data.frame) - with tm.assert_raises_regex(ValueError, 'ffil'): - ray_df.fillna(method='ffil') - - -@pytest.fixture -def test_fillna_invalid_value(): - test_data = TestData() - ray_df = pd.DataFrame(test_data.frame) - # list - pytest.raises(TypeError, ray_df.fillna, [1, 2]) - # tuple - pytest.raises(TypeError, ray_df.fillna, (1, 2)) - # TODO: Uncomment when iloc is implemented - # frame with series - # pytest.raises(ValueError, ray_df.iloc[:, 0].fillna, ray_df) - - -@pytest.fixture -def test_fillna_col_reordering(): - cols = ["COL." + str(i) for i in range(5, 0, -1)] - data = np.random.rand(20, 5) - df = pandas.DataFrame(index=range(20), columns=cols, data=data) - ray_df = pd.DataFrame(index=range(20), columns=cols, data=data) - assert ray_df_equals_pandas(ray_df.fillna(method='ffill'), - df.fillna(method='ffill')) - - -""" -TODO: Use this when Arrow issue resolves: -(https://issues.apache.org/jira/browse/ARROW-2122) -@pytest.fixture -def test_fillna_datetime_columns(): - frame_data = {'A': [-1, -2, np.nan], - 'B': date_range('20130101', periods=3), - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]} - df = pandas.DataFrame(frame_data, index=date_range('20130110', periods=3)) - ray_df = pd.DataFrame(frame_data, index=date_range('20130110', periods=3)) - assert ray_df_equals_pandas(ray_df.fillna('?'), df.fillna('?')) - - frame_data = {'A': [-1, -2, np.nan], - 'B': [pandas.Timestamp('2013-01-01'), - pandas.Timestamp('2013-01-02'), pandas.NaT], - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]} - df = pandas.DataFrame(frame_data, index=date_range('20130110', periods=3)) - ray_df = pd.DataFrame(frame_data, index=date_range('20130110', periods=3)) - assert ray_df_equals_pandas(ray_df.fillna('?'), df.fillna('?')) -""" - - -@pytest.fixture -def test_filter(ray_df, pandas_df, by): - assert ray_df_equals_pandas(ray_df.filter(items=by['items']), - pandas_df.filter(items=by['items'])) - - assert ray_df_equals_pandas(ray_df.filter(regex=by['regex']), - pandas_df.filter(regex=by['regex'])) - - assert ray_df_equals_pandas(ray_df.filter(like=by['like']), - pandas_df.filter(like=by['like'])) - - -def test_first(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.first(None) - - -@pytest.fixture -def test_first_valid_index(ray_df, pandas_df): - assert ray_df.first_valid_index() == (pandas_df.first_valid_index()) - - -def test_floordiv(): - test_inter_df_math("floordiv", simple=False) - - -def test_from_csv(): - with pytest.raises(NotImplementedError): - pd.DataFrame.from_csv(None) - - -def test_from_dict(): - with pytest.raises(NotImplementedError): - pd.DataFrame.from_dict(None) - - -def test_from_items(): - with pytest.raises(NotImplementedError): - pd.DataFrame.from_items(None) - - -def test_from_records(): - with pytest.raises(NotImplementedError): - pd.DataFrame.from_records(None) - - -def test_ge(): - test_comparison_inter_ops("ge") - - -def test_get_value(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.get_value(None, None) - - -def test_get_values(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.get_values() - - -def test_gt(): - test_comparison_inter_ops("gt") - - -@pytest.fixture -def test_head(ray_df, pandas_df, n=5): - assert ray_df_equals_pandas(ray_df.head(n), pandas_df.head(n)) - - -def test_hist(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.hist(None) - - -@pytest.fixture -def test_idxmax(ray_df, pandas_df): - assert \ - ray_df.idxmax().equals(pandas_df.idxmax()) - - -@pytest.fixture -def test_idxmin(ray_df, pandas_df): - assert \ - ray_df.idxmin().equals(pandas_df.idxmin()) - - -def test_infer_objects(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.infer_objects() - - -@pytest.fixture -def test_info(ray_df): - info_string = ray_df.info() - assert '\n' in info_string - info_string = ray_df.info(memory_usage=True) - assert 'memory_usage: ' in info_string - - -@pytest.fixture -def test_insert(ray_df, pandas_df, loc, column, value): - ray_df_cp = ray_df.copy() - pd_df_cp = pandas_df.copy() - - ray_df_cp.insert(loc, column, value) - pd_df_cp.insert(loc, column, value) - - -def test_interpolate(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.interpolate() - - -@pytest.fixture -def test_items(ray_df, pandas_df): - ray_items = ray_df.items() - pandas_items = pandas_df.items() - for ray_item, pandas_item in zip(ray_items, pandas_items): - ray_index, ray_series = ray_item - pandas_index, pandas_series = pandas_item - assert pandas_series.equals(ray_series) - assert pandas_index == ray_index - - -@pytest.fixture -def test_iteritems(ray_df, pandas_df): - ray_items = ray_df.iteritems() - pandas_items = pandas_df.iteritems() - for ray_item, pandas_item in zip(ray_items, pandas_items): - ray_index, ray_series = ray_item - pandas_index, pandas_series = pandas_item - assert pandas_series.equals(ray_series) - assert pandas_index == ray_index - - -@pytest.fixture -def test_iterrows(ray_df, pandas_df): - ray_iterrows = ray_df.iterrows() - pandas_iterrows = pandas_df.iterrows() - for ray_row, pandas_row in zip(ray_iterrows, pandas_iterrows): - ray_index, ray_series = ray_row - pandas_index, pandas_series = pandas_row - assert pandas_series.equals(ray_series) - assert pandas_index == ray_index - - -@pytest.fixture -def test_itertuples(ray_df, pandas_df): - # test default - ray_it_default = ray_df.itertuples() - pandas_it_default = pandas_df.itertuples() - for ray_row, pandas_row in zip(ray_it_default, pandas_it_default): - np.testing.assert_equal(ray_row, pandas_row) - - # test all combinations of custom params - indices = [True, False] - names = [None, 'NotPandas', 'Pandas'] - - for index in indices: - for name in names: - ray_it_custom = ray_df.itertuples(index=index, name=name) - pandas_it_custom = pandas_df.itertuples(index=index, name=name) - for ray_row, pandas_row in zip(ray_it_custom, pandas_it_custom): - np.testing.assert_equal(ray_row, pandas_row) - - -def test_join(): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - - ray_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - frame_data2 = {"col5": [0], "col6": [1]} - ray_df2 = pd.DataFrame(frame_data2) - pandas_df2 = pandas.DataFrame(frame_data2) - - join_types = ["left", "right", "outer", "inner"] - for how in join_types: - ray_join = ray_df.join(ray_df2, how=how) - pandas_join = pandas_df.join(pandas_df2, how=how) - assert ray_df_equals_pandas(ray_join, pandas_join) - - frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]} - - ray_df3 = pd.DataFrame(frame_data3) - pandas_df3 = pandas.DataFrame(frame_data3) - - join_types = ["left", "outer", "inner"] - for how in join_types: - ray_join = ray_df.join([ray_df2, ray_df3], how=how) - pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how) - assert ray_df_equals_pandas(ray_join, pandas_join) - - -def test_kurt(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.kurt() - - -def test_kurtosis(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.kurtosis() - - -def test_last(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.last(None) - - -@pytest.fixture -def test_last_valid_index(ray_df, pandas_df): - assert ray_df.last_valid_index() == (pandas_df.last_valid_index()) - - -def test_le(): - test_comparison_inter_ops("le") - - -def test_lookup(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.lookup(None, None) - - -def test_lt(): - test_comparison_inter_ops("lt") - - -def test_mad(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.mad() - - -def test_mask(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.mask(None) - - -@pytest.fixture -def test_max(ray_df, pandas_df): - assert ray_series_equals_pandas(ray_df.max(), pandas_df.max()) - assert ray_series_equals_pandas(ray_df.max(axis=1), pandas_df.max(axis=1)) - - -@pytest.fixture -def test_mean(ray_df, pandas_df): - assert ray_df.mean().equals(pandas_df.mean()) - - -@pytest.fixture -def test_median(ray_df, pandas_df): - assert ray_df.median().equals(pandas_df.median()) - - -def test_melt(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.melt() - - -@pytest.fixture -def test_memory_usage(ray_df): - assert type(ray_df.memory_usage()) is pandas.core.series.Series - assert ray_df.memory_usage(index=True).at['Index'] is not None - assert ray_df.memory_usage(deep=True).sum() >= \ - ray_df.memory_usage(deep=False).sum() - - -def test_merge(): - frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]} - - ray_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]} - ray_df2 = pd.DataFrame(frame_data2) - pandas_df2 = pandas.DataFrame(frame_data2) - - join_types = ["outer", "inner"] - for how in join_types: - # Defaults - ray_result = ray_df.merge(ray_df2, how=how) - pandas_result = pandas_df.merge(pandas_df2, how=how) - ray_df_equals_pandas(ray_result, pandas_result) - - # left_on and right_index - ray_result = ray_df.merge(ray_df2, how=how, left_on='col1', - right_index=True) - pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col1', - right_index=True) - ray_df_equals_pandas(ray_result, pandas_result) - - # left_index and right_index - ray_result = ray_df.merge(ray_df2, how=how, left_index=True, - right_index=True) - pandas_result = pandas_df.merge(pandas_df2, how=how, left_index=True, - right_index=True) - ray_df_equals_pandas(ray_result, pandas_result) - - # left_index and right_on - ray_result = ray_df.merge(ray_df2, how=how, left_index=True, - right_on='col1') - pandas_result = pandas_df.merge(pandas_df2, how=how, left_index=True, - right_on='col1') - ray_df_equals_pandas(ray_result, pandas_result) - - # left_on and right_on col1 - ray_result = ray_df.merge(ray_df2, how=how, left_on='col1', - right_on='col1') - pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col1', - right_on='col1') - ray_df_equals_pandas(ray_result, pandas_result) - - # left_on and right_on col2 - ray_result = ray_df.merge(ray_df2, how=how, left_on='col2', - right_on='col2') - pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col2', - right_on='col2') - ray_df_equals_pandas(ray_result, pandas_result) - - -@pytest.fixture -def test_min(ray_df, pandas_df): - assert ray_series_equals_pandas(ray_df.min(), pandas_df.min()) - assert ray_series_equals_pandas(ray_df.min(axis=1), pandas_df.min(axis=1)) - - -def test_mod(): - test_inter_df_math("mod", simple=False) - - -@pytest.fixture -def test_mode(ray_df, pandas_df): - assert ray_series_equals_pandas(ray_df.mode(), pandas_df.mode()) - assert ray_series_equals_pandas(ray_df.mode(axis=1), - pandas_df.mode(axis=1)) - - -def test_mul(): - test_inter_df_math("mul", simple=False) - - -def test_multiply(): - test_inter_df_math("multiply", simple=False) - - -def test_ne(): - test_comparison_inter_ops("ne") - - -def test_nlargest(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.nlargest(None, None) - - -@pytest.fixture -def test_notna(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.notna(), pandas_df.notna()) - - -@pytest.fixture -def test_notnull(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.notnull(), pandas_df.notnull()) - - -def test_nsmallest(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.nsmallest(None, None) - - -@pytest.fixture -def test_nunique(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.nunique(), pandas_df.nunique()) - assert ray_df_equals_pandas(ray_df.nunique(axis=1), - pandas_df.nunique(axis=1)) - - -def test_pct_change(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.pct_change() - - -@pytest.fixture -def test_pipe(ray_df, pandas_df): - n = len(ray_df.index) - a, b, c = 2 % n, 0, 3 % n - col = ray_df.columns[3 % len(ray_df.columns)] - - def h(x): - return x.drop(columns=[col]) - - def g(x, arg1=0): - for _ in range(arg1): - x = x.append(x) - return x - - def f(x, arg2=0, arg3=0): - return x.drop([arg2, arg3]) - - assert ray_df_equals(f(g(h(ray_df), arg1=a), arg2=b, arg3=c), - (ray_df.pipe(h) - .pipe(g, arg1=a) - .pipe(f, arg2=b, arg3=c))) - - assert ray_df_equals_pandas((ray_df.pipe(h) - .pipe(g, arg1=a) - .pipe(f, arg2=b, arg3=c)), - (pandas_df.pipe(h) - .pipe(g, arg1=a) - .pipe(f, arg2=b, arg3=c))) - - -def test_pivot(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.pivot() - - -def test_pivot_table(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.pivot_table() - - -def test_plot(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.plot() - - -@pytest.fixture -def test_pop(ray_df, pandas_df): - temp_ray_df = ray_df.copy() - temp_pandas_df = pandas_df.copy() - ray_popped = temp_ray_df.pop('col2') - pandas_popped = temp_pandas_df.pop('col2') - assert ray_popped.equals(pandas_popped) - assert ray_df_equals_pandas(temp_ray_df, temp_pandas_df) - - -def test_pow(): - test_inter_df_math("pow", simple=False) - - -@pytest.fixture -def test_prod(ray_df, pandas_df): - assert ray_df.prod().equals(pandas_df.prod()) - - -@pytest.fixture -def test_product(ray_df, pandas_df): - assert ray_df.product().equals(pandas_df.product()) - - -@pytest.fixture -def test_quantile(ray_df, pandas_df, q): - assert ray_df.quantile(q).equals(pandas_df.quantile(q)) - - -@pytest.fixture -def test_query(ray_df, pandas_df, funcs): - for f in funcs: - pandas_df_new, ray_df_new = pandas_df.query(f), ray_df.query(f) - assert pandas_df_new.equals(to_pandas(ray_df_new)) - - -def test_radd(): - test_inter_df_math_right_ops("radd") - - -@pytest.fixture -def test_rank(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.rank(), pandas_df.rank()) - assert ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1)) - - -def test_rdiv(): - test_inter_df_math_right_ops("rdiv") - - -def test_reindex(): - frame_data = {'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]} - pandas_df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - assert ray_df_equals_pandas( - ray_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1])) - - assert ray_df_equals_pandas( - ray_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2])) - - assert ray_df_equals_pandas( - ray_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1), - pandas_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1)) - - assert ray_df_equals_pandas( - ray_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1), - pandas_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1)) - - assert ray_df_equals_pandas( - ray_df.reindex(index=[0, 1, 5], - columns=['col1', 'col7', 'col4', 'col8']), - pandas_df.reindex(index=[0, 1, 5], - columns=['col1', 'col7', 'col4', 'col8'])) - - -def test_reindex_axis(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.reindex_axis(None) - - -def test_reindex_like(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.reindex_like(None) - - -# Renaming - -def test_rename(): - test_rename_sanity() - test_rename_multiindex() - # TODO: Uncomment when __setitem__ is implemented - # test_rename_nocopy() - test_rename_inplace() - test_rename_bug() - - -@pytest.fixture -def test_rename_sanity(): - test_data = TestData() - mapping = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'} - - ray_df = pd.DataFrame(test_data.frame) - assert ray_df_equals_pandas(ray_df.rename(columns=mapping), - test_data.frame.rename(columns=mapping)) - - renamed2 = test_data.frame.rename(columns=str.lower) - assert ray_df_equals_pandas(ray_df.rename(columns=str.lower), renamed2) - - ray_df = pd.DataFrame(renamed2) - assert ray_df_equals_pandas(ray_df.rename(columns=str.upper), - renamed2.rename(columns=str.upper)) - - # index - data = {'A': {'foo': 0, 'bar': 1}} - - # gets sorted alphabetical - df = pandas.DataFrame(data) - ray_df = pd.DataFrame(data) - tm.assert_index_equal( - ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}).index, - df.rename(index={'foo': 'bar', 'bar': 'foo'}).index) - - tm.assert_index_equal(ray_df.rename(index=str.upper).index, - df.rename(index=str.upper).index) - - # have to pass something - pytest.raises(TypeError, ray_df.rename) - - # partial columns - renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}) - ray_df = pd.DataFrame(test_data.frame) - tm.assert_index_equal( - ray_df.rename(columns={'C': 'foo', 'D': 'bar'}).index, - test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}).index) - - # TODO: Uncomment when transpose works - # other axis - # renamed = test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}) - # tm.assert_index_equal( - # test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}).index, - # ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index) - - # index with name - index = pandas.Index(['foo', 'bar'], name='name') - renamer = pandas.DataFrame(data, index=index) - ray_df = pd.DataFrame(data, index=index) - - renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) - ray_renamed = ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}) - tm.assert_index_equal(renamed.index, ray_renamed.index) - - assert renamed.index.name == ray_renamed.index.name - - -@pytest.fixture -def test_rename_multiindex(): - tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] - tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] - index = pandas.MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) - columns = pandas.MultiIndex.from_tuples( - tuples_columns, names=['fizz', 'buzz']) - - frame_data = [(0, 0), (1, 1)] - df = pandas.DataFrame(frame_data, index=index, columns=columns) - ray_df = pd.DataFrame(frame_data, index=index, columns=columns) - - # - # without specifying level -> accross all levels - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) - ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) - tm.assert_index_equal(renamed.index, ray_renamed.index) - - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - assert renamed.index.names == ray_renamed.index.names - assert renamed.columns.names == ray_renamed.columns.names - - # - # with specifying a level - - # dict - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=0) - ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=0) - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='fizz') - ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='fizz') - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=1) - ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=1) - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='buzz') - ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='buzz') - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - - # function - func = str.upper - renamed = df.rename(columns=func, level=0) - ray_renamed = ray_df.rename(columns=func, level=0) - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns=func, level='fizz') - ray_renamed = ray_df.rename(columns=func, level='fizz') - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - - renamed = df.rename(columns=func, level=1) - ray_renamed = ray_df.rename(columns=func, level=1) - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns=func, level='buzz') - ray_renamed = ray_df.rename(columns=func, level='buzz') - tm.assert_index_equal(renamed.columns, ray_renamed.columns) - - # index - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0) - ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - level=0) - tm.assert_index_equal(ray_renamed.index, renamed.index) - - -@pytest.fixture -def test_rename_nocopy(): - test_data = TestData().frame - ray_df = pd.DataFrame(test_data) - ray_renamed = ray_df.rename(columns={'C': 'foo'}, copy=False) - ray_renamed['foo'] = 1 - assert (ray_df['C'] == 1).all() - - -@pytest.fixture -def test_rename_inplace(): - test_data = TestData().frame - ray_df = pd.DataFrame(test_data) - - assert ray_df_equals_pandas(ray_df.rename(columns={'C': 'foo'}), - test_data.rename(columns={'C': 'foo'})) - - frame = test_data.copy() - ray_frame = ray_df.copy() - frame.rename(columns={'C': 'foo'}, inplace=True) - ray_frame.rename(columns={'C': 'foo'}, inplace=True) - - assert ray_df_equals_pandas(ray_frame, frame) - - -@pytest.fixture -def test_rename_bug(): - # rename set ref_locs, and set_index was not resetting - frame_data = {0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - df = df.rename(columns={0: 'a'}) - df = df.rename(columns={1: 'b'}) - # TODO: Uncomment when set_index is implemented - # df = df.set_index(['a', 'b']) - # df.columns = ['2001-01-01'] - - ray_df = ray_df.rename(columns={0: 'a'}) - ray_df = ray_df.rename(columns={1: 'b'}) - # TODO: Uncomment when set_index is implemented - # ray_df = ray_df.set_index(['a', 'b']) - # ray_df.columns = ['2001-01-01'] - - assert ray_df_equals_pandas(ray_df, df) - - -def test_rename_axis_inplace(): - test_frame = TestData().frame - ray_df = pd.DataFrame(test_frame) - - result = test_frame.copy() - ray_result = ray_df.copy() - no_return = result.rename_axis('foo', inplace=True) - ray_no_return = ray_result.rename_axis('foo', inplace=True) - - assert no_return is ray_no_return - assert ray_df_equals_pandas(ray_result, result) - - result = test_frame.copy() - ray_result = ray_df.copy() - no_return = result.rename_axis('bar', axis=1, inplace=True) - ray_no_return = ray_result.rename_axis('bar', axis=1, inplace=True) - - assert no_return is ray_no_return - assert ray_df_equals_pandas(ray_result, result) - - -def test_reorder_levels(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.reorder_levels(None) - - -def test_replace(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.replace() - - -def test_resample(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.resample(None) - - -@pytest.fixture -def test_reset_index(ray_df, pandas_df, inplace=False): - if not inplace: - assert to_pandas(ray_df.reset_index(inplace=inplace)).equals( - pandas_df.reset_index(inplace=inplace)) - else: - ray_df_cp = ray_df.copy() - pd_df_cp = pandas_df.copy() - ray_df_cp.reset_index(inplace=inplace) - pd_df_cp.reset_index(inplace=inplace) - assert to_pandas(ray_df_cp).equals(pd_df_cp) - - -@pytest.mark.skip(reason="dtypes on different partitions may not match up, " - "no fix for this yet") -def test_rfloordiv(): - test_inter_df_math_right_ops("rfloordiv") - - -@pytest.mark.skip(reason="dtypes on different partitions may not match up, " - "no fix for this yet") -def test_rmod(): - test_inter_df_math_right_ops("rmod") - - -def test_rmul(): - test_inter_df_math_right_ops("rmul") - - -def test_rolling(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.rolling(None) - - -@pytest.fixture -def test_round(ray_df, pd_df): - assert ray_df_equals_pandas(ray_df.round(), pd_df.round()) - assert ray_df_equals_pandas(ray_df.round(1), pd_df.round(1)) - - -def test_rpow(): - test_inter_df_math_right_ops("rpow") - - -def test_rsub(): - test_inter_df_math_right_ops("rsub") - - -@pytest.mark.skip(reason="dtypes on different partitions may not match up, " - "no fix for this yet") -def test_rtruediv(): - test_inter_df_math_right_ops("rtruediv") - - -def test_sample(): - ray_df = create_test_dataframe() - assert len(ray_df.sample(n=4)) == 4 - assert len(ray_df.sample(frac=0.5)) == 2 - - -def test_select(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.select(None) - - -def test_select_dtypes(): - frame_data = {'test1': list('abc'), - 'test2': np.arange(3, 6).astype('u1'), - 'test3': np.arange(8.0, 11.0, dtype='float64'), - 'test4': [True, False, True], - 'test5': pandas.date_range('now', periods=3).values, - 'test6': list(range(5, 8))} - df = pandas.DataFrame(frame_data) - rd = pd.DataFrame(frame_data) - - include = np.float, 'integer' - exclude = np.bool_, - r = rd.select_dtypes(include=include, exclude=exclude) - - e = df[["test2", "test3", "test6"]] - assert ray_df_equals_pandas(r, e) - - try: - pd.DataFrame().select_dtypes() - assert False - except ValueError: - assert True - - -def test_sem(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.sem() - - -@pytest.fixture -def test_set_axis(ray_df, pandas_df, label, axis): - assert to_pandas(ray_df.set_axis(label, axis, inplace=False)).equals( - pandas_df.set_axis(label, axis, inplace=False)) - - -@pytest.fixture -def test_set_index(ray_df, pandas_df, keys, inplace=False): - if not inplace: - assert to_pandas(ray_df.set_index(keys)).equals( - pandas_df.set_index(keys)) - else: - ray_df_cp = ray_df.copy() - pd_df_cp = pandas_df.copy() - ray_df_cp.set_index(keys, inplace=inplace) - pd_df_cp.set_index(keys, inplace=inplace) - assert to_pandas(ray_df_cp).equals(pd_df_cp) - - -def test_set_value(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.set_value(None, None, None) - - -def test_shift(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.shift() - - -@pytest.fixture -def test_skew(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.skew(), pandas_df.skew()) - assert ray_df_equals_pandas(ray_df.skew(axis=1), - pandas_df.skew(axis=1)) - - -def test_slice_shift(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.slice_shift() - - -def test_sort_index(): - frame_data = np.random.randint(0, 100, size=(1000, 100)) - pandas_df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - pandas_result = pandas_df - ray_result = ray_df - - assert ray_df_equals_pandas(ray_result, pandas_result) - - pandas_result = pandas_df.sort_index(ascending=False) - ray_result = ray_df.sort_index(ascending=False) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - -def test_sort_values(): - frame_data = np.random.randint(0, 100, size=(1000, 100)) - pandas_df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - pandas_result = pandas_df.sort_values(by=1) - ray_result = ray_df.sort_values(by=1) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - pandas_result = pandas_df.sort_values(by=1, axis=1) - ray_result = ray_df.sort_values(by=1, axis=1) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - pandas_result = pandas_df.sort_values(by=[1, 3]) - ray_result = ray_df.sort_values(by=[1, 3]) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - pandas_result = pandas_df.sort_values(by=[1, 67], axis=1) - ray_result = ray_df.sort_values(by=[1, 67], axis=1) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - -def test_sortlevel(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.sortlevel() - - -def test_squeeze(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.squeeze() - - -def test_stack(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.stack() - - -@pytest.fixture -def test_std(ray_df, pandas_df): - assert ray_df.std().equals(pandas_df.std()) - - -def test_sub(): - test_inter_df_math("sub", simple=False) - - -def test_subtract(): - test_inter_df_math("subtract", simple=False) - - -def test_swapaxes(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.swapaxes(None, None) - - -def test_swaplevel(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.swaplevel() - - -@pytest.fixture -def test_tail(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.tail(), pandas_df.tail()) - - -def test_take(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.take(None) - - -def test_to_records(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_records() - - -def test_to_sparse(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_sparse() - - -def test_to_string(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_string() - - -def test_to_timestamp(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_timestamp() - - -def test_to_xarray(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_xarray() - - -@pytest.fixture -def test_transform(ray_df, pandas_df): - assert ray_df_equals_pandas(ray_df.transform(lambda df: df.isna()), - pandas_df.transform(lambda df: df.isna())) - assert ray_df_equals_pandas(ray_df.transform('isna'), - pandas_df.transform('isna')) - - -def test_truediv(): - test_inter_df_math("truediv", simple=False) - - -def test_truncate(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.truncate() - - -def test_tshift(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.tshift() - - -def test_tz_convert(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.tz_convert(None) - - -def test_tz_localize(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.tz_localize(None) - - -def test_unstack(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.unstack() - - -def test_update(): - df = pd.DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) - - other = pd.DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) - - df.update(other) - - expected = pd.DataFrame([[1.5, np.nan, 3], - [3.6, 2, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 7.]]) - assert ray_df_equals(df, expected) - - -@pytest.fixture -def test_var(ray_df, pandas_df): - assert ray_df.var().equals(pandas_df.var()) - - -def test_where(): - frame_data = np.random.randn(100, 10) - pandas_df = pandas.DataFrame(frame_data, columns=list('abcdefghij')) - ray_df = pd.DataFrame(frame_data, columns=list('abcdefghij')) - - pandas_cond_df = pandas_df % 5 < 2 - ray_cond_df = ray_df % 5 < 2 - - pandas_result = pandas_df.where(pandas_cond_df, -pandas_df) - ray_result = ray_df.where(ray_cond_df, -ray_df) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - other = pandas_df.loc[3] - - pandas_result = pandas_df.where(pandas_cond_df, other, axis=1) - ray_result = ray_df.where(ray_cond_df, other, axis=1) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - other = pandas_df['e'] - - pandas_result = pandas_df.where(pandas_cond_df, other, axis=0) - ray_result = ray_df.where(ray_cond_df, other, axis=0) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - pandas_result = pandas_df.where(pandas_df < 2, True) - ray_result = ray_df.where(ray_df < 2, True) - - assert ray_df_equals_pandas(ray_result, pandas_result) - - -def test_xs(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.xs(None) - - -@pytest.fixture -def test___getitem__(ray_df, pd_df): - ray_col = ray_df.__getitem__('col1') - assert isinstance(ray_col, pandas.Series) - - pd_col = pd_df['col1'] - assert pd_col.equals(ray_col) - - -def test___getattr__(): - df = create_test_dataframe() - - col = df.__getattr__("col1") - assert isinstance(col, pandas.Series) - - col = getattr(df, "col1") - assert isinstance(col, pandas.Series) - - col = df.col1 - assert isinstance(col, pandas.Series) - - # Check that lookup in column doesn't override other attributes - df2 = df.rename(index=str, columns={"col5": "columns"}) - assert isinstance(df2.columns, pandas.Index) - - -def test___setitem__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__setitem__(None, None) - - -@pytest.fixture -def test___len__(ray_df, pandas_df): - assert len(ray_df) == len(pandas_df) - - -def test___unicode__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__unicode__() - - -@pytest.fixture -def test___neg__(ray_df, pd_df): - ray_df_neg = ray_df.__neg__() - assert pd_df.__neg__().equals(to_pandas(ray_df_neg)) - - -def test___invert__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__invert__() - - -def test___hash__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__hash__() - - -@pytest.fixture -def test___iter__(ray_df, pd_df): - ray_iterator = ray_df.__iter__() - - # Check that ray_iterator implements the iterator interface - assert hasattr(ray_iterator, '__iter__') - assert hasattr(ray_iterator, 'next') or hasattr(ray_iterator, '__next__') - - pd_iterator = pd_df.__iter__() - assert list(ray_iterator) == list(pd_iterator) - - -@pytest.fixture -def test___contains__(ray_df, key, result): - assert result == ray_df.__contains__(key) - assert result == (key in ray_df) - - -def test___nonzero__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__nonzero__() - - -def test___bool__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__bool__() - - -@pytest.fixture -def test___abs__(ray_df, pandas_df): - assert ray_df_equals_pandas(abs(ray_df), abs(pandas_df)) - - -def test___round__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__round__() - - -@pytest.fixture -def test___array__(ray_df, pandas_df): - assert np.array_equal(ray_df.__array__(), pandas_df.__array__()) - - -def test___getstate__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__getstate__() - - -def test___setstate__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__setstate__(None) - - -@pytest.fixture -def test___delitem__(ray_df, pd_df): - ray_df = ray_df.copy() - pd_df = pd_df.copy() - ray_df.__delitem__('col1') - pd_df.__delitem__('col1') - assert ray_df_equals_pandas(ray_df, pd_df) - - # Issue 2027 - last_label = pd_df.iloc[:, -1].name - ray_df.__delitem__(last_label) - pd_df.__delitem__(last_label) - ray_df_equals_pandas(ray_df, pd_df) - - -def test___finalize__(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.__finalize__(None) - - -@pytest.fixture -def test___copy__(ray_df, pd_df): - ray_df_copy, pd_df_copy = ray_df.__copy__(), pd_df.__copy__() - assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) - - -@pytest.fixture -def test___deepcopy__(ray_df, pd_df): - ray_df_copy, pd_df_copy = ray_df.__deepcopy__(), pd_df.__deepcopy__() - assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) - - -def test_blocks(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.blocks - - -def test_style(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.style - - -def test_iat(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.iat() - - -def test___rsub__(): - test_inter_df_math_right_ops("__rsub__") - - -@pytest.fixture -def test_loc(ray_df, pd_df): - # Singleton - assert ray_df.loc[0].equals(pd_df.loc[0]) - assert ray_df.loc[0, 'col1'] == pd_df.loc[0, 'col1'] - - # List - assert ray_df.loc[[1, 2]].equals(pd_df.loc[[1, 2]]) - assert ray_df.loc[[1, 2], ['col1']].equals(pd_df.loc[[1, 2], ['col1']]) - - # Slice - assert ray_df.loc[1:, 'col1'].equals(pd_df.loc[1:, 'col1']) - assert ray_df.loc[1:2, 'col1'].equals(pd_df.loc[1:2, 'col1']) - assert ray_df.loc[1:2, 'col1':'col2'].equals(pd_df.loc[1:2, 'col1':'col2']) - - -def test_is_copy(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.is_copy - - -def test___div__(): - test_inter_df_math("__div__", simple=True) - - -def test_at(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.at() - - -def test_ix(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.ix() - - -@pytest.fixture -def test_iloc(ray_df, pd_df): - # Singleton - assert ray_df.iloc[0].equals(pd_df.iloc[0]) - assert ray_df.iloc[0, 1] == pd_df.iloc[0, 1] - - # List - assert ray_df.iloc[[1, 2]].equals(pd_df.iloc[[1, 2]]) - assert ray_df.iloc[[1, 2], [1, 0]].equals(pd_df.iloc[[1, 2], [1, 0]]) - - # Slice - assert ray_df.iloc[1:, 0].equals(pd_df.iloc[1:, 0]) - assert ray_df.iloc[1:2, 0].equals(pd_df.iloc[1:2, 0]) - assert ray_df.iloc[1:2, 0:2].equals(pd_df.iloc[1:2, 0:2]) - - -def test__doc__(): - assert pd.DataFrame.__doc__ != pandas.DataFrame.__doc__ - assert pd.DataFrame.__init__ != pandas.DataFrame.__init__ - for attr, obj in pd.DataFrame.__dict__.items(): - if (callable(obj) or isinstance(obj, property)) \ - and attr != "__init__": - pd_obj = getattr(pandas.DataFrame, attr, None) - if callable(pd_obj) or isinstance(pd_obj, property): - assert obj.__doc__ == pd_obj.__doc__ - - -def test_to_datetime(): - frame_data = {'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]} - ray_df = pd.DataFrame(frame_data) - pd_df = pandas.DataFrame(frame_data) - - pd.to_datetime(ray_df).equals(pandas.to_datetime(pd_df)) - - -def test_get_dummies(): - frame_data = {'A': ['a', 'b', 'a'], - 'B': ['b', 'a', 'c'], - 'C': [1, 2, 3]} - ray_df = pd.DataFrame(frame_data) - pd_df = pandas.DataFrame(frame_data) - - assert ray_df_equals_pandas(pd.get_dummies(ray_df), - pandas.get_dummies(pd_df)) diff --git a/python/ray/dataframe/test/test_groupby.py b/python/ray/dataframe/test/test_groupby.py deleted file mode 100644 index 47945eb37..000000000 --- a/python/ray/dataframe/test/test_groupby.py +++ /dev/null @@ -1,589 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pytest -import sys -import pandas -import numpy as np -import ray.dataframe as pd -from ray.dataframe.utils import ( - from_pandas, - to_pandas) - -PY2 = False -if sys.version_info.major < 3: - PY2 = True - - -@pytest.fixture -def ray_df_equals_pandas(ray_df, pandas_df): - assert isinstance(ray_df, pd.DataFrame) - assert to_pandas(ray_df).equals(pandas_df) - - -@pytest.fixture -def ray_df_almost_equals_pandas(ray_df, pandas_df): - assert isinstance(ray_df, pd.DataFrame) - difference = to_pandas(ray_df) - pandas_df - diff_max = difference.max().max() - assert to_pandas(ray_df).equals(pandas_df) or diff_max < 0.0001 - - -@pytest.fixture -def ray_series_equals_pandas(ray_df, pandas_df): - assert ray_df.equals(pandas_df) - - -@pytest.fixture -def ray_df_equals(ray_df1, ray_df2): - assert to_pandas(ray_df1).equals(to_pandas(ray_df2)) - - -@pytest.fixture -def ray_groupby_equals_pandas(ray_groupby, pandas_groupby): - for g1, g2 in zip(ray_groupby, pandas_groupby): - assert g1[0] == g2[0] - ray_df_equals_pandas(g1[1], g2[1]) - - -def test_simple_row_groupby(): - pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [3, 8, 12, 10], - 'col4': [17, 13, 16, 15], - 'col5': [-4, -5, -6, -7]}) - - ray_df = from_pandas(pandas_df, 2) - - by = [1, 2, 1, 2] - n = 1 - - ray_groupby = ray_df.groupby(by=by) - pandas_groupby = pandas_df.groupby(by=by) - - ray_groupby_equals_pandas(ray_groupby, pandas_groupby) - test_ngroups(ray_groupby, pandas_groupby) - test_skew(ray_groupby, pandas_groupby) - test_ffill(ray_groupby, pandas_groupby) - test_sem(ray_groupby, pandas_groupby) - test_mean(ray_groupby, pandas_groupby) - test_any(ray_groupby, pandas_groupby) - test_min(ray_groupby, pandas_groupby) - test_idxmax(ray_groupby, pandas_groupby) - test_ndim(ray_groupby, pandas_groupby) - test_cumsum(ray_groupby, pandas_groupby) - test_pct_change(ray_groupby, pandas_groupby) - test_cummax(ray_groupby, pandas_groupby) - - apply_functions = [lambda df: df.sum(), lambda df: -df] - for func in apply_functions: - test_apply(ray_groupby, pandas_groupby, func) - - test_dtypes(ray_groupby, pandas_groupby) - test_first(ray_groupby, pandas_groupby) - test_backfill(ray_groupby, pandas_groupby) - test_cummin(ray_groupby, pandas_groupby) - test_bfill(ray_groupby, pandas_groupby) - test_idxmin(ray_groupby, pandas_groupby) - test_prod(ray_groupby, pandas_groupby) - test_std(ray_groupby, pandas_groupby) - - agg_functions = ['min', 'max'] - for func in agg_functions: - test_agg(ray_groupby, pandas_groupby, func) - test_aggregate(ray_groupby, pandas_groupby, func) - - test_last(ray_groupby, pandas_groupby) - test_mad(ray_groupby, pandas_groupby) - test_rank(ray_groupby, pandas_groupby) - test_max(ray_groupby, pandas_groupby) - test_var(ray_groupby, pandas_groupby) - test_len(ray_groupby, pandas_groupby) - test_sum(ray_groupby, pandas_groupby) - test_ngroup(ray_groupby, pandas_groupby) - test_nunique(ray_groupby, pandas_groupby) - test_median(ray_groupby, pandas_groupby) - test_head(ray_groupby, pandas_groupby, n) - test_cumprod(ray_groupby, pandas_groupby) - test_cov(ray_groupby, pandas_groupby) - - transform_functions = [lambda df: df + 4, lambda df: -df - 10] - for func in transform_functions: - test_transform(ray_groupby, pandas_groupby, func) - - pipe_functions = [lambda dfgb: dfgb.sum()] - for func in pipe_functions: - test_pipe(ray_groupby, pandas_groupby, func) - - test_corr(ray_groupby, pandas_groupby) - test_fillna(ray_groupby, pandas_groupby) - test_count(ray_groupby, pandas_groupby) - test_tail(ray_groupby, pandas_groupby, n) - test_quantile(ray_groupby, pandas_groupby) - test_take(ray_groupby, pandas_groupby) - - -def test_single_group_row_groupby(): - pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 36, 7], - 'col3': [3, 8, 12, 10], - 'col4': [17, 3, 16, 15], - 'col5': [-4, 5, -6, -7]}) - - ray_df = from_pandas(pandas_df, 2) - - by = [1, 1, 1, 1] - n = 6 - - ray_groupby = ray_df.groupby(by=by) - pandas_groupby = pandas_df.groupby(by=by) - - ray_groupby_equals_pandas(ray_groupby, pandas_groupby) - test_ngroups(ray_groupby, pandas_groupby) - test_skew(ray_groupby, pandas_groupby) - test_ffill(ray_groupby, pandas_groupby) - test_sem(ray_groupby, pandas_groupby) - test_mean(ray_groupby, pandas_groupby) - test_any(ray_groupby, pandas_groupby) - test_min(ray_groupby, pandas_groupby) - test_idxmax(ray_groupby, pandas_groupby) - test_ndim(ray_groupby, pandas_groupby) - test_cumsum(ray_groupby, pandas_groupby) - test_pct_change(ray_groupby, pandas_groupby) - test_cummax(ray_groupby, pandas_groupby) - - apply_functions = [lambda df: df.sum(), lambda df: -df] - for func in apply_functions: - test_apply(ray_groupby, pandas_groupby, func) - - test_dtypes(ray_groupby, pandas_groupby) - test_first(ray_groupby, pandas_groupby) - test_backfill(ray_groupby, pandas_groupby) - test_cummin(ray_groupby, pandas_groupby) - test_bfill(ray_groupby, pandas_groupby) - test_idxmin(ray_groupby, pandas_groupby) - test_prod(ray_groupby, pandas_groupby) - test_std(ray_groupby, pandas_groupby) - - agg_functions = ['min', 'max'] - for func in agg_functions: - test_agg(ray_groupby, pandas_groupby, func) - test_aggregate(ray_groupby, pandas_groupby, func) - - test_last(ray_groupby, pandas_groupby) - test_mad(ray_groupby, pandas_groupby) - test_rank(ray_groupby, pandas_groupby) - test_max(ray_groupby, pandas_groupby) - test_var(ray_groupby, pandas_groupby) - test_len(ray_groupby, pandas_groupby) - test_sum(ray_groupby, pandas_groupby) - test_ngroup(ray_groupby, pandas_groupby) - test_nunique(ray_groupby, pandas_groupby) - test_median(ray_groupby, pandas_groupby) - test_head(ray_groupby, pandas_groupby, n) - test_cumprod(ray_groupby, pandas_groupby) - test_cov(ray_groupby, pandas_groupby) - - transform_functions = [lambda df: df + 4, lambda df: -df - 10] - for func in transform_functions: - test_transform(ray_groupby, pandas_groupby, func) - - pipe_functions = [lambda dfgb: dfgb.sum()] - for func in pipe_functions: - test_pipe(ray_groupby, pandas_groupby, func) - - test_corr(ray_groupby, pandas_groupby) - test_fillna(ray_groupby, pandas_groupby) - test_count(ray_groupby, pandas_groupby) - test_tail(ray_groupby, pandas_groupby, n) - test_quantile(ray_groupby, pandas_groupby) - test_take(ray_groupby, pandas_groupby) - - -def test_large_row_groupby(): - pandas_df = pandas.DataFrame(np.random.randint(0, 8, size=(100, 4)), - columns=list('ABCD')) - - ray_df = from_pandas(pandas_df, 2) - - by = pandas_df['A'].tolist() - n = 4 - - ray_groupby = ray_df.groupby(by=by) - pandas_groupby = pandas_df.groupby(by=by) - - ray_groupby_equals_pandas(ray_groupby, pandas_groupby) - test_ngroups(ray_groupby, pandas_groupby) - test_skew(ray_groupby, pandas_groupby) - test_ffill(ray_groupby, pandas_groupby) - test_sem(ray_groupby, pandas_groupby) - test_mean(ray_groupby, pandas_groupby) - test_any(ray_groupby, pandas_groupby) - test_min(ray_groupby, pandas_groupby) - test_idxmax(ray_groupby, pandas_groupby) - test_ndim(ray_groupby, pandas_groupby) - test_cumsum(ray_groupby, pandas_groupby) - test_pct_change(ray_groupby, pandas_groupby) - test_cummax(ray_groupby, pandas_groupby) - - apply_functions = [lambda df: df.sum(), lambda df: -df] - for func in apply_functions: - test_apply(ray_groupby, pandas_groupby, func) - - test_dtypes(ray_groupby, pandas_groupby) - test_first(ray_groupby, pandas_groupby) - test_backfill(ray_groupby, pandas_groupby) - test_cummin(ray_groupby, pandas_groupby) - test_bfill(ray_groupby, pandas_groupby) - test_idxmin(ray_groupby, pandas_groupby) - # test_prod(ray_groupby, pandas_groupby) causes overflows - test_std(ray_groupby, pandas_groupby) - - agg_functions = ['min', 'max'] - for func in agg_functions: - test_agg(ray_groupby, pandas_groupby, func) - test_aggregate(ray_groupby, pandas_groupby, func) - - test_last(ray_groupby, pandas_groupby) - test_mad(ray_groupby, pandas_groupby) - test_rank(ray_groupby, pandas_groupby) - test_max(ray_groupby, pandas_groupby) - test_var(ray_groupby, pandas_groupby) - test_len(ray_groupby, pandas_groupby) - test_sum(ray_groupby, pandas_groupby) - test_ngroup(ray_groupby, pandas_groupby) - test_nunique(ray_groupby, pandas_groupby) - test_median(ray_groupby, pandas_groupby) - test_head(ray_groupby, pandas_groupby, n) - # test_cumprod(ray_groupby, pandas_groupby) causes overflows - test_cov(ray_groupby, pandas_groupby) - - transform_functions = [lambda df: df + 4, lambda df: -df - 10] - for func in transform_functions: - test_transform(ray_groupby, pandas_groupby, func) - - pipe_functions = [lambda dfgb: dfgb.sum()] - for func in pipe_functions: - test_pipe(ray_groupby, pandas_groupby, func) - - test_corr(ray_groupby, pandas_groupby) - test_fillna(ray_groupby, pandas_groupby) - test_count(ray_groupby, pandas_groupby) - test_tail(ray_groupby, pandas_groupby, n) - test_quantile(ray_groupby, pandas_groupby) - test_take(ray_groupby, pandas_groupby) - - -def test_simple_col_groupby(): - pandas_df = pandas.DataFrame({'col1': [0, 3, 2, 3], - 'col2': [4, 1, 6, 7], - 'col3': [3, 8, 2, 10], - 'col4': [1, 13, 6, 15], - 'col5': [-4, 5, 6, -7]}) - - ray_df = from_pandas(pandas_df, 2) - - by = [1, 2, 3, 2, 1] - - ray_groupby = ray_df.groupby(axis=1, by=by) - pandas_groupby = pandas_df.groupby(axis=1, by=by) - - ray_groupby_equals_pandas(ray_groupby, pandas_groupby) - test_ngroups(ray_groupby, pandas_groupby) - test_skew(ray_groupby, pandas_groupby) - test_ffill(ray_groupby, pandas_groupby) - test_sem(ray_groupby, pandas_groupby) - test_mean(ray_groupby, pandas_groupby) - test_any(ray_groupby, pandas_groupby) - test_min(ray_groupby, pandas_groupby) - test_ndim(ray_groupby, pandas_groupby) - - if not PY2: - # idxmax and idxmin fail on column groupby in pandas with python2 - test_idxmax(ray_groupby, pandas_groupby) - test_idxmin(ray_groupby, pandas_groupby) - test_rank(ray_groupby, pandas_groupby) - test_quantile(ray_groupby, pandas_groupby) - - # https://github.com/pandas-dev/pandas/issues/21127 - # test_cumsum(ray_groupby, pandas_groupby) - # test_cummax(ray_groupby, pandas_groupby) - # test_cummin(ray_groupby, pandas_groupby) - # test_cumprod(ray_groupby, pandas_groupby) - - test_pct_change(ray_groupby, pandas_groupby) - apply_functions = [lambda df: -df, lambda df: df.sum(axis=1)] - for func in apply_functions: - test_apply(ray_groupby, pandas_groupby, func) - - test_first(ray_groupby, pandas_groupby) - test_backfill(ray_groupby, pandas_groupby) - test_bfill(ray_groupby, pandas_groupby) - test_prod(ray_groupby, pandas_groupby) - test_std(ray_groupby, pandas_groupby) - test_last(ray_groupby, pandas_groupby) - test_mad(ray_groupby, pandas_groupby) - test_max(ray_groupby, pandas_groupby) - test_var(ray_groupby, pandas_groupby) - test_len(ray_groupby, pandas_groupby) - test_sum(ray_groupby, pandas_groupby) - - # Pandas fails on this case with ValueError - # test_ngroup(ray_groupby, pandas_groupby) - # test_nunique(ray_groupby, pandas_groupby) - test_median(ray_groupby, pandas_groupby) - test_cov(ray_groupby, pandas_groupby) - - transform_functions = [lambda df: df + 4, lambda df: -df - 10] - for func in transform_functions: - test_transform(ray_groupby, pandas_groupby, func) - - pipe_functions = [lambda dfgb: dfgb.sum()] - for func in pipe_functions: - test_pipe(ray_groupby, pandas_groupby, func) - - test_corr(ray_groupby, pandas_groupby) - test_fillna(ray_groupby, pandas_groupby) - test_count(ray_groupby, pandas_groupby) - test_take(ray_groupby, pandas_groupby) - - -@pytest.fixture -def test_ngroups(ray_groupby, pandas_groupby): - assert ray_groupby.ngroups == pandas_groupby.ngroups - - -@pytest.fixture -def test_skew(ray_groupby, pandas_groupby): - ray_df_almost_equals_pandas(ray_groupby.skew(), pandas_groupby.skew()) - - -@pytest.fixture -def test_ffill(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.ffill(), pandas_groupby.ffill()) - - -@pytest.fixture -def test_sem(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.sem() - - -@pytest.fixture -def test_mean(ray_groupby, pandas_groupby): - ray_df_almost_equals_pandas(ray_groupby.mean(), pandas_groupby.mean()) - - -@pytest.fixture -def test_any(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.any(), pandas_groupby.any()) - - -@pytest.fixture -def test_min(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.min(), pandas_groupby.min()) - - -@pytest.fixture -def test_idxmax(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.idxmax(), pandas_groupby.idxmax()) - - -@pytest.fixture -def test_ndim(ray_groupby, pandas_groupby): - assert ray_groupby.ndim == pandas_groupby.ndim - - -@pytest.fixture -def test_cumsum(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.cumsum(), pandas_groupby.cumsum()) - ray_df_equals_pandas(ray_groupby.cumsum(axis=1), - pandas_groupby.cumsum(axis=1)) - - -@pytest.fixture -def test_pct_change(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.pct_change() - - -@pytest.fixture -def test_cummax(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.cummax(), pandas_groupby.cummax()) - ray_df_equals_pandas(ray_groupby.cummax(axis=1), - pandas_groupby.cummax(axis=1)) - - -@pytest.fixture -def test_apply(ray_groupby, pandas_groupby, func): - ray_df_equals_pandas(ray_groupby.apply(func), pandas_groupby.apply(func)) - - -@pytest.fixture -def test_dtypes(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.dtypes, pandas_groupby.dtypes) - - -@pytest.fixture -def test_first(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.first() - - -@pytest.fixture -def test_backfill(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.backfill(), pandas_groupby.backfill()) - - -@pytest.fixture -def test_cummin(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.cummin(), pandas_groupby.cummin()) - ray_df_equals_pandas(ray_groupby.cummin(axis=1), - pandas_groupby.cummin(axis=1)) - - -@pytest.fixture -def test_bfill(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.bfill(), pandas_groupby.bfill()) - - -@pytest.fixture -def test_idxmin(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.idxmin(), pandas_groupby.idxmin()) - - -@pytest.fixture -def test_prod(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.prod(), pandas_groupby.prod()) - - -@pytest.fixture -def test_std(ray_groupby, pandas_groupby): - ray_df_almost_equals_pandas(ray_groupby.std(), pandas_groupby.std()) - - -@pytest.fixture -def test_aggregate(ray_groupby, pandas_groupby, func): - ray_df_equals_pandas(ray_groupby.aggregate(func), - pandas_groupby.aggregate(func)) - - -@pytest.fixture -def test_agg(ray_groupby, pandas_groupby, func): - ray_df_equals_pandas(ray_groupby.agg(func), pandas_groupby.agg(func)) - - -@pytest.fixture -def test_last(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.last() - - -@pytest.fixture -def test_mad(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.mad() - - -@pytest.fixture -def test_rank(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.rank(), pandas_groupby.rank()) - - -@pytest.fixture -def test_max(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.max(), pandas_groupby.max()) - - -@pytest.fixture -def test_var(ray_groupby, pandas_groupby): - ray_df_almost_equals_pandas(ray_groupby.var(), pandas_groupby.var()) - - -@pytest.fixture -def test_len(ray_groupby, pandas_groupby): - assert len(ray_groupby) == len(pandas_groupby) - - -@pytest.fixture -def test_sum(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.sum(), pandas_groupby.sum()) - - -@pytest.fixture -def test_ngroup(ray_groupby, pandas_groupby): - ray_series_equals_pandas(ray_groupby.ngroup(), pandas_groupby.ngroup()) - - -@pytest.fixture -def test_nunique(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.nunique(), pandas_groupby.nunique()) - - -@pytest.fixture -def test_median(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.median(), pandas_groupby.median()) - - -@pytest.fixture -def test_head(ray_groupby, pandas_groupby, n): - ray_df_equals_pandas(ray_groupby.head(n=n), pandas_groupby.head(n=n)) - - -@pytest.fixture -def test_cumprod(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.cumprod(), pandas_groupby.cumprod()) - ray_df_equals_pandas(ray_groupby.cumprod(axis=1), - pandas_groupby.cumprod(axis=1)) - - -@pytest.fixture -def test_cov(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.cov() - - -@pytest.fixture -def test_transform(ray_groupby, pandas_groupby, func): - ray_df_equals_pandas(ray_groupby.transform(func), - pandas_groupby.transform(func)) - - -@pytest.fixture -def test_corr(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.corr() - - -@pytest.fixture -def test_fillna(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.fillna(method="ffill"), - pandas_groupby.fillna(method="ffill")) - - -@pytest.fixture -def test_count(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.count(), pandas_groupby.count()) - - -@pytest.fixture -def test_pipe(ray_groupby, pandas_groupby, func): - ray_df_equals_pandas(ray_groupby.pipe(func), pandas_groupby.pipe(func)) - - -@pytest.fixture -def test_tail(ray_groupby, pandas_groupby, n): - ray_df_equals_pandas(ray_groupby.tail(n=n), pandas_groupby.tail(n=n)) - - -@pytest.fixture -def test_quantile(ray_groupby, pandas_groupby): - ray_df_equals_pandas(ray_groupby.quantile(q=0.4), - pandas_groupby.quantile(q=0.4)) - - -@pytest.fixture -def test_take(ray_groupby, pandas_groupby): - with pytest.raises(NotImplementedError): - ray_groupby.take(indices=[1]) diff --git a/python/ray/dataframe/test/test_io.py b/python/ray/dataframe/test/test_io.py deleted file mode 100644 index c2ab544be..000000000 --- a/python/ray/dataframe/test/test_io.py +++ /dev/null @@ -1,675 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pytest -import numpy as np -import pandas -from ray.dataframe.utils import to_pandas -import ray.dataframe as pd -import os -import sqlite3 - -TEST_PARQUET_FILENAME = 'test.parquet' -TEST_CSV_FILENAME = 'test.csv' -TEST_JSON_FILENAME = 'test.json' -TEST_HTML_FILENAME = 'test.html' -TEST_EXCEL_FILENAME = 'test.xlsx' -TEST_FEATHER_FILENAME = 'test.feather' -TEST_HDF_FILENAME = 'test.hdf' -TEST_MSGPACK_FILENAME = 'test.msg' -TEST_STATA_FILENAME = 'test.dta' -TEST_PICKLE_FILENAME = 'test.pkl' -TEST_SAS_FILENAME = os.getcwd() + '/data/test1.sas7bdat' -TEST_SQL_FILENAME = 'test.db' -SMALL_ROW_SIZE = 2000 -LARGE_ROW_SIZE = 7e6 - - -@pytest.fixture -def ray_df_equals_pandas(ray_df, pandas_df): - return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index()) - - -@pytest.fixture -def setup_parquet_file(row_size, force=False): - if os.path.exists(TEST_PARQUET_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_parquet(TEST_PARQUET_FILENAME) - - -@pytest.fixture -def create_test_ray_dataframe(): - df = pd.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - - return df - - -@pytest.fixture -def create_test_pandas_dataframe(): - df = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - - return df - - -@pytest.fixture -def test_files_eq(path1, path2): - with open(path1, 'rb') as file1, open(path2, 'rb') as file2: - file1_content = file1.read() - file2_content = file2.read() - - if file1_content == file2_content: - return True - else: - return False - - -@pytest.fixture -def teardown_test_file(test_path): - if os.path.exists(test_path): - os.remove(test_path) - - -@pytest.fixture -def teardown_parquet_file(): - if os.path.exists(TEST_PARQUET_FILENAME): - os.remove(TEST_PARQUET_FILENAME) - - -@pytest.fixture -def setup_csv_file(row_size, force=False, delimiter=','): - if os.path.exists(TEST_CSV_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_csv(TEST_CSV_FILENAME, sep=delimiter) - - -@pytest.fixture -def teardown_csv_file(): - if os.path.exists(TEST_CSV_FILENAME): - os.remove(TEST_CSV_FILENAME) - - -@pytest.fixture -def setup_json_file(row_size, force=False): - if os.path.exists(TEST_JSON_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_json(TEST_JSON_FILENAME) - - -@pytest.fixture -def teardown_json_file(): - if os.path.exists(TEST_JSON_FILENAME): - os.remove(TEST_JSON_FILENAME) - - -@pytest.fixture -def setup_html_file(row_size, force=False): - if os.path.exists(TEST_HTML_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_html(TEST_HTML_FILENAME) - - -@pytest.fixture -def teardown_html_file(): - if os.path.exists(TEST_HTML_FILENAME): - os.remove(TEST_HTML_FILENAME) - - -@pytest.fixture -def setup_clipboard(row_size, force=False): - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_clipboard() - - -@pytest.fixture -def setup_excel_file(row_size, force=False): - if os.path.exists(TEST_EXCEL_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_excel(TEST_EXCEL_FILENAME) - - -@pytest.fixture -def teardown_excel_file(): - if os.path.exists(TEST_EXCEL_FILENAME): - os.remove(TEST_EXCEL_FILENAME) - - -@pytest.fixture -def setup_feather_file(row_size, force=False): - if os.path.exists(TEST_FEATHER_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_feather(TEST_FEATHER_FILENAME) - - -@pytest.fixture -def teardown_feather_file(): - if os.path.exists(TEST_FEATHER_FILENAME): - os.remove(TEST_FEATHER_FILENAME) - - -@pytest.fixture -def setup_hdf_file(row_size, force=False): - if os.path.exists(TEST_HDF_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_hdf(TEST_HDF_FILENAME, 'test') - - -@pytest.fixture -def teardown_hdf_file(): - if os.path.exists(TEST_HDF_FILENAME): - os.remove(TEST_HDF_FILENAME) - - -@pytest.fixture -def setup_msgpack_file(row_size, force=False): - if os.path.exists(TEST_MSGPACK_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_msgpack(TEST_MSGPACK_FILENAME) - - -@pytest.fixture -def teardown_msgpack_file(): - if os.path.exists(TEST_MSGPACK_FILENAME): - os.remove(TEST_MSGPACK_FILENAME) - - -@pytest.fixture -def setup_stata_file(row_size, force=False): - if os.path.exists(TEST_STATA_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_stata(TEST_STATA_FILENAME) - - -@pytest.fixture -def teardown_stata_file(): - if os.path.exists(TEST_STATA_FILENAME): - os.remove(TEST_STATA_FILENAME) - - -@pytest.fixture -def setup_pickle_file(row_size, force=False): - if os.path.exists(TEST_PICKLE_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_pickle(TEST_PICKLE_FILENAME) - - -@pytest.fixture -def teardown_pickle_file(): - if os.path.exists(TEST_PICKLE_FILENAME): - os.remove(TEST_PICKLE_FILENAME) - - -@pytest.fixture -def setup_sql_file(conn, force=False): - if os.path.exists(TEST_SQL_FILENAME) and not force: - pass - else: - df = pandas.DataFrame({'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0]}) - df.to_sql(TEST_SQL_FILENAME.split(".")[0], conn) - - -@pytest.fixture -def teardown_sql_file(): - if os.path.exists(TEST_SQL_FILENAME): - os.remove(TEST_SQL_FILENAME) - - -def test_from_parquet_small(): - - setup_parquet_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME) - ray_df = pd.read_parquet(TEST_PARQUET_FILENAME) - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_parquet_file() - - -def test_from_parquet_large(): - setup_parquet_file(LARGE_ROW_SIZE) - - pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME) - ray_df = pd.read_parquet(TEST_PARQUET_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_parquet_file() - - -def test_from_csv(): - setup_csv_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_csv(TEST_CSV_FILENAME) - ray_df = pd.read_csv(TEST_CSV_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_csv_file() - - -def test_from_json(): - setup_json_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_json(TEST_JSON_FILENAME) - ray_df = pd.read_json(TEST_JSON_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_json_file() - - -def test_from_html(): - setup_html_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_html(TEST_HTML_FILENAME)[0] - ray_df = pd.read_html(TEST_HTML_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_html_file() - - -@pytest.mark.skip(reason="No clipboard on Travis") -def test_from_clipboard(): - setup_clipboard(SMALL_ROW_SIZE) - - pandas_df = pandas.read_clipboard() - ray_df = pd.read_clipboard() - - assert ray_df_equals_pandas(ray_df, pandas_df) - - -def test_from_excel(): - setup_excel_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME) - ray_df = pd.read_excel(TEST_EXCEL_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_excel_file() - - -def test_from_feather(): - setup_feather_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_feather(TEST_FEATHER_FILENAME) - ray_df = pd.read_feather(TEST_FEATHER_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_feather_file() - - -@pytest.mark.skip(reason="Memory overflow on Travis") -def test_from_hdf(): - setup_hdf_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key='test') - ray_df = pd.read_hdf(TEST_HDF_FILENAME, key='test') - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_hdf_file() - - -def test_from_msgpack(): - setup_msgpack_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_msgpack(TEST_MSGPACK_FILENAME) - ray_df = pd.read_msgpack(TEST_MSGPACK_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_msgpack_file() - - -def test_from_stata(): - setup_stata_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_stata(TEST_STATA_FILENAME) - ray_df = pd.read_stata(TEST_STATA_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_stata_file() - - -def test_from_pickle(): - setup_pickle_file(SMALL_ROW_SIZE) - - pandas_df = pandas.read_pickle(TEST_PICKLE_FILENAME) - ray_df = pd.read_pickle(TEST_PICKLE_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_pickle_file() - - -def test_from_sql(): - conn = sqlite3.connect(TEST_SQL_FILENAME) - setup_sql_file(conn, True) - - pandas_df = pandas.read_sql("select * from test", conn) - ray_df = pd.read_sql("select * from test", conn) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_sql_file() - - -@pytest.mark.skip(reason="No SAS write methods in Pandas") -def test_from_sas(): - pandas_df = pandas.read_sas(TEST_SAS_FILENAME) - ray_df = pd.read_sas(TEST_SAS_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - -def test_from_csv_delimiter(): - setup_csv_file(SMALL_ROW_SIZE, delimiter='|') - - pandas_df = pandas.read_csv(TEST_CSV_FILENAME) - ray_df = pd.read_csv(TEST_CSV_FILENAME) - - assert ray_df_equals_pandas(ray_df, pandas_df) - - teardown_csv_file() - - -@pytest.mark.skip(reason="No clipboard on Travis") -def test_to_clipboard(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - ray_df.to_clipboard() - ray_as_clip = pandas.read_clipboard() - - pandas_df.to_clipboard() - pandas_as_clip = pandas.read_clipboard() - - assert(ray_as_clip.equals(pandas_as_clip)) - - -def test_to_csv(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_CSV_DF_FILENAME = "test_df.csv" - TEST_CSV_pandas_FILENAME = "test_pandas.csv" - - ray_df.to_csv(TEST_CSV_DF_FILENAME) - pandas_df.to_csv(TEST_CSV_pandas_FILENAME) - - assert(test_files_eq(TEST_CSV_DF_FILENAME, - TEST_CSV_pandas_FILENAME)) - - teardown_test_file(TEST_CSV_pandas_FILENAME) - teardown_test_file(TEST_CSV_DF_FILENAME) - - -def test_to_dense(): - ray_df = create_test_ray_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_dense() - - -def test_to_dict(): - ray_df = create_test_ray_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_dict() - - -def test_to_excel(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_EXCEL_DF_FILENAME = "test_df.xlsx" - TEST_EXCEL_pandas_FILENAME = "test_pandas.xlsx" - - ray_writer = pandas.ExcelWriter(TEST_EXCEL_DF_FILENAME) - pandas_writer = pandas.ExcelWriter(TEST_EXCEL_pandas_FILENAME) - - ray_df.to_excel(ray_writer) - pandas_df.to_excel(pandas_writer) - - ray_writer.save() - pandas_writer.save() - - assert(test_files_eq(TEST_EXCEL_DF_FILENAME, - TEST_EXCEL_pandas_FILENAME)) - - teardown_test_file(TEST_EXCEL_DF_FILENAME) - teardown_test_file(TEST_EXCEL_pandas_FILENAME) - - -def test_to_feather(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_FEATHER_DF_FILENAME = "test_df.feather" - TEST_FEATHER_pandas_FILENAME = "test_pandas.feather" - - ray_df.to_feather(TEST_FEATHER_DF_FILENAME) - pandas_df.to_feather(TEST_FEATHER_pandas_FILENAME) - - assert(test_files_eq(TEST_FEATHER_DF_FILENAME, - TEST_FEATHER_pandas_FILENAME)) - - teardown_test_file(TEST_FEATHER_pandas_FILENAME) - teardown_test_file(TEST_FEATHER_DF_FILENAME) - - -def test_to_gbq(): - ray_df = create_test_ray_dataframe() - - TEST_GBQ_DF_FILENAME = "test_df.gbq" - with pytest.raises(NotImplementedError): - ray_df.to_gbq(TEST_GBQ_DF_FILENAME, None) - - -def test_to_html(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_HTML_DF_FILENAME = "test_df.html" - TEST_HTML_pandas_FILENAME = "test_pandas.html" - - ray_df.to_html(TEST_HTML_DF_FILENAME) - pandas_df.to_html(TEST_HTML_pandas_FILENAME) - - assert(test_files_eq(TEST_HTML_DF_FILENAME, - TEST_HTML_pandas_FILENAME)) - - teardown_test_file(TEST_HTML_pandas_FILENAME) - teardown_test_file(TEST_HTML_DF_FILENAME) - - -def test_to_json(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_JSON_DF_FILENAME = "test_df.json" - TEST_JSON_pandas_FILENAME = "test_pandas.json" - - ray_df.to_json(TEST_JSON_DF_FILENAME) - pandas_df.to_json(TEST_JSON_pandas_FILENAME) - - assert(test_files_eq(TEST_JSON_DF_FILENAME, - TEST_JSON_pandas_FILENAME)) - - teardown_test_file(TEST_JSON_pandas_FILENAME) - teardown_test_file(TEST_JSON_DF_FILENAME) - - -def test_to_latex(): - ray_df = create_test_ray_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_latex() - - -def test_to_msgpack(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_MSGPACK_DF_FILENAME = "test_df.msgpack" - TEST_MSGPACK_pandas_FILENAME = "test_pandas.msgpack" - - ray_df.to_msgpack(TEST_MSGPACK_DF_FILENAME) - pandas_df.to_msgpack(TEST_MSGPACK_pandas_FILENAME) - - assert(test_files_eq(TEST_MSGPACK_DF_FILENAME, - TEST_MSGPACK_pandas_FILENAME)) - - teardown_test_file(TEST_MSGPACK_pandas_FILENAME) - teardown_test_file(TEST_MSGPACK_DF_FILENAME) - - -def test_to_panel(): - ray_df = create_test_ray_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_panel() - - -def test_to_parquet(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_PARQUET_DF_FILENAME = "test_df.parquet" - TEST_PARQUET_pandas_FILENAME = "test_pandas.parquet" - - ray_df.to_parquet(TEST_PARQUET_DF_FILENAME) - pandas_df.to_parquet(TEST_PARQUET_pandas_FILENAME) - - assert(test_files_eq(TEST_PARQUET_DF_FILENAME, - TEST_PARQUET_pandas_FILENAME)) - - teardown_test_file(TEST_PARQUET_pandas_FILENAME) - teardown_test_file(TEST_PARQUET_DF_FILENAME) - - -def test_to_period(): - ray_df = create_test_ray_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.to_period() - - -def test_to_pickle(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_PICKLE_DF_FILENAME = "test_df.pkl" - TEST_PICKLE_pandas_FILENAME = "test_pandas.pkl" - - ray_df.to_pickle(TEST_PICKLE_DF_FILENAME) - pandas_df.to_pickle(TEST_PICKLE_pandas_FILENAME) - - assert(test_files_eq(TEST_PICKLE_DF_FILENAME, - TEST_PICKLE_pandas_FILENAME)) - - teardown_test_file(TEST_PICKLE_pandas_FILENAME) - teardown_test_file(TEST_PICKLE_DF_FILENAME) - - -def test_to_sql(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_SQL_DF_FILENAME = "test_df.sql" - TEST_SQL_pandas_FILENAME = "test_pandas.sql" - - ray_df.to_pickle(TEST_SQL_DF_FILENAME) - pandas_df.to_pickle(TEST_SQL_pandas_FILENAME) - - assert(test_files_eq(TEST_SQL_DF_FILENAME, - TEST_SQL_pandas_FILENAME)) - - teardown_test_file(TEST_SQL_DF_FILENAME) - teardown_test_file(TEST_SQL_pandas_FILENAME) - - -def test_to_stata(): - ray_df = create_test_ray_dataframe() - pandas_df = create_test_pandas_dataframe() - - TEST_STATA_DF_FILENAME = "test_df.stata" - TEST_STATA_pandas_FILENAME = "test_pandas.stata" - - ray_df.to_stata(TEST_STATA_DF_FILENAME) - pandas_df.to_stata(TEST_STATA_pandas_FILENAME) - - assert(test_files_eq(TEST_STATA_DF_FILENAME, - TEST_STATA_pandas_FILENAME)) - - teardown_test_file(TEST_STATA_pandas_FILENAME) - teardown_test_file(TEST_STATA_DF_FILENAME) diff --git a/python/ray/dataframe/test/test_series.py b/python/ray/dataframe/test/test_series.py deleted file mode 100644 index bf26e32e8..000000000 --- a/python/ray/dataframe/test/test_series.py +++ /dev/null @@ -1,1992 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import pytest -import ray.dataframe as pd - - -@pytest.fixture -def create_test_series(): - return pd.Series(None) - - -def test_T(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.T - - -def test___abs__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__abs__() - - -def test___add__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__add__(None, None) - - -def test___and__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__and__(None) - - -def test___array__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__array__(None) - - -def test___array_prepare__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__array_prepare__(None) - - -def test___array_priority__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__array_priority__ - - -def test___array_wrap__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__array_wrap__(None) - - -def test___bool__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__bool__() - - -def test___bytes__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__bytes__() - - -def test___class__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__class__(None, None, None, None, None) - - -def test___contains__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__contains__(None) - - -def test___copy__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__copy__(None) - - -def test___deepcopy__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__deepcopy__(None) - - -def test___delitem__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__delitem__(None) - - -def test___div__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__div__(None, None) - - -def test___divmod__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__divmod__(None, None) - - -def test___doc__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__doc__ - - -def test___eq__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__eq__(None) - - -def test___finalize__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__finalize__(None, None) - - -def test___float__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__float__() - - -def test___floordiv__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__floordiv__(None, None) - - -def test___ge__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__ge__(None) - - -def test___getitem__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__getitem__(None) - - -def test___getstate__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__getstate__() - - -def test___gt__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__gt__(None) - - -def test___iadd__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__iadd__(None) - - -def test___imul__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__imul__(None) - - -def test___int__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__int__() - - -def test___invert__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__invert__() - - -def test___ipow__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__ipow__(None) - - -def test___isub__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__isub__(None) - - -def test___iter__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__iter__() - - -def test___itruediv__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__itruediv__(None) - - -def test___le__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__le__(None) - - -def test___len__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__len__() - - -def test___long__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__long__() - - -def test___lt__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__lt__(None) - - -def test___mod__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__mod__(None, None) - - -def test___mul__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__mul__(None, None) - - -def test___ne__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__ne__(None) - - -def test___neg__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__neg__() - - -def test___nonzero__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__nonzero__() - - -def test___or__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__or__(None) - - -def test___pow__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__pow__(None, None) - - -def test___repr__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__repr__() - - -def test___round__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__round__(None) - - -def test___setitem__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__setitem__(None, None) - - -def test___setstate__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__setstate__(None) - - -def test___sizeof__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__sizeof__() - - -def test___str__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__str__() - - -def test___sub__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__sub__(None, None) - - -def test___truediv__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__truediv__(None, None) - - -def test___xor__(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.__xor__(None) - - -def test_abs(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.abs() - - -def test_add(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.add(None, None, None) - - -def test_add_prefix(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.add_prefix(None) - - -def test_add_suffix(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.add_suffix(None) - - -def test_agg(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.agg(None, None, None) - - -def test_aggregate(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.aggregate(None, None, None) - - -def test_align(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.align(None, None, None, None, None, None, None, None, None) - - -def test_all(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.all(None, None, None, None) - - -def test_any(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.any(None, None, None, None) - - -def test_append(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.append(None, None) - - -def test_apply(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.apply(None, None, None) - - -def test_argmax(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.argmax(None, None, None) - - -def test_argmin(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.argmin(None, None, None) - - -def test_argsort(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.argsort(None, None) - - -def test_as_blocks(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.as_blocks(None) - - -def test_as_matrix(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.as_matrix(None) - - -def test_asfreq(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.asfreq(None, None, None, None) - - -def test_asobject(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.asobject - - -def test_asof(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.asof(None) - - -def test_astype(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.astype(None, None, None) - - -def test_at(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.at(None) - - -def test_at_time(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.at_time(None) - - -def test_autocorr(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.autocorr(None) - - -def test_axes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.axes - - -def test_base(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.base - - -def test_between(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.between(None, None) - - -def test_between_time(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.between_time(None, None, None) - - -def test_bfill(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.bfill(None, None, None) - - -def test_blocks(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.blocks - - -def test_bool(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.bool() - - -def test_clip(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.clip(None, None, None, None) - - -def test_clip_lower(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.clip_lower(None) - - -def test_clip_upper(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.clip_upper(None) - - -def test_combine(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.combine(None, None) - - -def test_combine_first(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.combine_first(None) - - -def test_compound(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.compound(None, None) - - -def test_compress(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.compress(None, None) - - -def test_consolidate(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.consolidate(None) - - -def test_convert_objects(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.convert_objects(None, None, None) - - -def test_copy(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.copy(None) - - -def test_corr(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.corr(None, None) - - -def test_count(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.count(None) - - -def test_cov(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.cov(None) - - -def test_cummax(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.cummax(None, None, None) - - -def test_cummin(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.cummin(None, None, None) - - -def test_cumprod(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.cumprod(None, None, None) - - -def test_cumsum(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.cumsum(None, None, None) - - -def test_data(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.data - - -def test_describe(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.describe(None, None) - - -def test_diff(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.diff(None) - - -def test_div(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.div(None, None, None) - - -def test_divide(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.divide(None, None, None) - - -def test_dot(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.dot(None) - - -def test_drop(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.drop(None, None, None, None) - - -def test_drop_duplicates(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.drop_duplicates(None) - - -def test_dropna(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.dropna(None, None) - - -def test_dtype(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.dtype - - -def test_dtypes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.dtypes - - -def test_duplicated(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.duplicated(None) - - -def test_empty(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.empty - - -def test_eq(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.eq(None, None, None) - - -def test_equals(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.equals(None) - - -def test_ewm(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ewm(None, None, None, None, None, None, None, None) - - -def test_expanding(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.expanding(None, None, None) - - -def test_factorize(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.factorize(None) - - -def test_ffill(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ffill(None, None, None) - - -def test_fillna(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.fillna(None, None, None, None, None, None) - - -def test_filter(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.filter(None, None, None) - - -def test_first(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.first(None) - - -def test_first_valid_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.first_valid_index() - - -def test_flags(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.flags - - -def test_floordiv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.floordiv(None, None, None) - - -def test_from_array(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.from_array(None, None, None, None, None) - - -def test_from_csv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.from_csv(None, None, None, None, None, None) - - -def test_ftype(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ftype - - -def test_ftypes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ftypes - - -def test_ge(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ge(None, None, None) - - -def test_get(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get(None) - - -def test_get_dtype_counts(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_dtype_counts() - - -def test_get_ftype_counts(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_ftype_counts() - - -def test_get_value(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_value(None) - - -def test_get_values(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_values() - - -def test_groupby(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.groupby(None, None, None, None, None, None, None) - - -def test_gt(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.gt(None, None, None) - - -def test_hasnans(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.hasnans - - -def test_head(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.head(None) - - -def test_hist(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.hist(None, None, None, None, None, None, None, None, None) - - -def test_iat(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.iat(None) - - -def test_idxmax(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.idxmax(None, None, None) - - -def test_idxmin(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.idxmin(None, None, None) - - -def test_iloc(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.iloc(None) - - -def test_imag(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.imag - - -def test_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.index - - -def test_interpolate(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.interpolate(None, None, None, None, None, None) - - -def test_is_copy(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_copy - - -def test_is_monotonic(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_monotonic - - -def test_is_monotonic_decreasing(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_monotonic_decreasing - - -def test_is_monotonic_increasing(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_monotonic_increasing - - -def test_is_unique(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_unique - - -def test_isin(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.isin(None) - - -def test_isnull(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.isnull() - - -def test_item(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.item() - - -def test_items(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.items() - - -def test_itemsize(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.itemsize - - -def test_iteritems(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.iteritems() - - -def test_ix(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ix(None) - - -def test_keys(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.keys() - - -def test_kurt(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.kurt(None, None, None, None) - - -def test_kurtosis(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.kurtosis(None, None, None, None) - - -def test_last(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.last(None) - - -def test_last_valid_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.last_valid_index() - - -def test_le(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.le(None, None, None) - - -def test_loc(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.loc(None) - - -def test_lt(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.lt(None, None, None) - - -def test_mad(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mad(None, None) - - -def test_map(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.map(None) - - -def test_mask(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mask(None, None, None, None, None, None) - - -def test_max(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.max(None, None, None, None) - - -def test_mean(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mean(None, None, None, None) - - -def test_median(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.median(None, None, None, None) - - -def test_memory_usage(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.memory_usage(None) - - -def test_min(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.min(None, None, None, None) - - -def test_mod(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mod(None, None, None) - - -def test_mode(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mode() - - -def test_mul(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mul(None, None, None) - - -def test_multiply(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.multiply(None, None, None) - - -def test_name(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.name - - -def test_nbytes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nbytes - - -def test_ndim(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ndim - - -def test_ne(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ne(None, None, None) - - -def test_nlargest(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nlargest(None) - - -def test_nonzero(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nonzero() - - -def test_notnull(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.notnull() - - -def test_nsmallest(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nsmallest(None) - - -def test_nunique(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nunique(None) - - -def test_pct_change(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pct_change(None, None, None, None) - - -def test_pipe(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pipe(None, None) - - -def test_plot(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.plot(None, None, None, None, None, None, None, None, None, - None, None, None, None, None, None, None, None, None, - None, None, None, None, None) - - -def test_pop(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pop(None) - - -def test_pow(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pow(None, None, None) - - -def test_prod(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.prod(None, None, None, None) - - -def test_product(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.product(None, None, None, None) - - -def test_ptp(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ptp(None, None, None, None) - - -def test_put(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.put(None) - - -def test_quantile(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.quantile(None) - - -def test_radd(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.radd(None, None, None) - - -def test_rank(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rank(None, None, None, None, None) - - -def test_ravel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ravel(None) - - -def test_rdiv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rdiv(None, None, None) - - -def test_real(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.real - - -def test_reindex(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reindex(None) - - -def test_reindex_axis(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reindex_axis(None, None) - - -def test_reindex_like(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reindex_like(None, None, None, None) - - -def test_rename(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rename(None) - - -def test_rename_axis(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rename_axis(None, None, None) - - -def test_reorder_levels(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reorder_levels(None) - - -def test_repeat(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.repeat(None, None) - - -def test_replace(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.replace(None, None, None, None, None, None) - - -def test_resample(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.resample(None, None, None, None, None, None, None, None, - None, None, None, None) - - -def test_reset_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reset_index(None, None, None) - - -def test_reshape(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reshape(None) - - -def test_rfloordiv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rfloordiv(None, None, None) - - -def test_rmod(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rmod(None, None, None) - - -def test_rmul(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rmul(None, None, None) - - -def test_rolling(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rolling(None, None, None, None, None, None, None) - - -def test_round(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.round(None, None) - - -def test_rpow(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rpow(None, None, None) - - -def test_rsub(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rsub(None, None, None) - - -def test_rtruediv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rtruediv(None, None, None) - - -def test_sample(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sample(None, None, None, None, None) - - -def test_searchsorted(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.searchsorted(None, None) - - -def test_select(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.select(None) - - -def test_sem(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sem(None, None, None, None, None) - - -def test_set_axis(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.set_axis(None, None) - - -def test_set_value(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.set_value(None, None) - - -def test_shape(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.shape - - -def test_shift(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.shift(None, None) - - -def test_size(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.size - - -def test_skew(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.skew(None, None, None, None) - - -def test_slice_shift(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.slice_shift(None) - - -def test_sort_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sort_index(None, None, None, None, None, None) - - -def test_sort_values(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sort_values(None, None, None, None) - - -def test_sortlevel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sortlevel(None, None) - - -def test_squeeze(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.squeeze(None) - - -def test_std(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.std(None, None, None, None, None) - - -def test_strides(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.strides - - -def test_sub(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sub(None, None, None) - - -def test_subtract(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.subtract(None, None, None) - - -def test_sum(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sum(None, None, None, None) - - -def test_swapaxes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.swapaxes(None, None) - - -def test_swaplevel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.swaplevel(None, None) - - -def test_tail(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tail(None) - - -def test_take(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.take(None, None, None, None) - - -def test_to_clipboard(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_clipboard(None, None) - - -def test_to_csv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_csv(None, None, None, None, None, None, None, None, - None, None) - - -def test_to_dense(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_dense() - - -def test_to_dict(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_dict() - - -def test_to_excel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_excel(None, None, None, None, None, None, None, None, - None, None, None, None, None, None) - - -def test_to_frame(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_frame(None) - - -def test_to_hdf(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_hdf(None, None) - - -def test_to_json(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_json(None, None, None, None, None, None, None) - - -def test_to_latex(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_latex(None, None, None, None, None, None, None, None, - None, None, None, None, None, None, None, None, - None, None) - - -def test_to_msgpack(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_msgpack(None, None) - - -def test_to_period(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_period(None) - - -def test_to_pickle(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_pickle(None) - - -def test_to_sparse(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_sparse(None) - - -def test_to_sql(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_sql(None, None, None, None, None, None, None, None) - - -def test_to_string(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_string(None, None, None, None, None, None, None, None) - - -def test_to_timestamp(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_timestamp(None, None) - - -def test_to_xarray(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_xarray() - - -def test_tolist(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tolist() - - -def test_transform(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.transform(None, None) - - -def test_transpose(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.transpose(None) - - -def test_truediv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.truediv(None, None, None) - - -def test_truncate(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.truncate(None, None, None) - - -def test_tshift(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tshift(None, None) - - -def test_tz_convert(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tz_convert(None, None, None) - - -def test_tz_localize(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tz_localize(None, None, None, None) - - -def test_unique(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.unique() - - -def test_unstack(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.unstack(None) - - -def test_update(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.update(None) - - -def test_valid(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.valid(None) - - -def test_value_counts(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.value_counts(None, None, None, None) - - -def test_values(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.values - - -def test_var(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.var(None, None, None, None, None) - - -def test_view(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.view(None) - - -def test_where(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.where(None, None, None, None, None, None) - - -def test_xs(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.xs(None, None, None) diff --git a/python/ray/dataframe/utils.py b/python/ray/dataframe/utils.py deleted file mode 100644 index 746ea5915..000000000 --- a/python/ray/dataframe/utils.py +++ /dev/null @@ -1,577 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import pandas -import numpy as np -import ray - -from . import get_npartitions - - -_NAN_BLOCKS = {} -_MEMOIZER_CAPACITY = 1000 # Capacity per function - - -class LRUCache: - """A LRUCache implemented with collections.OrderedDict - - Notes: - - OrderedDict will record the order each item is inserted. - - The head of the queue will be LRU items. - """ - - def __init__(self, capacity): - self.capacity = capacity - self.cache = collections.OrderedDict() - - def __contains__(self, key): - return key in self.cache - - def __getitem__(self, key): - """Retrieve item from cache and re-insert it to the back of the queue - """ - value = self.cache.pop(key) - self.cache[key] = value - return value - - def __setitem__(self, key, value): - if key in self.cache: - self.cache.pop(key) - - if len(self.cache) >= self.capacity: - # Pop oldest items at the beginning of the queue - self.cache.popitem(last=False) - - self.cache[key] = value - - -class memoize: - """A basic memoizer that cache the input and output of the remote function - - Notes: - - How is this implemented? - This meoizer is implemented by adding a caching layer to the remote - function's remote attribute. When user call f.remote(*args), we will - first check against the cache, and then call the ray remote function - if we can't find the return value in the cache. - - When should this be used? - This should be used when we anticipate temporal locality for the - function. For example, we can reasonally assume users will perform - columnar operation repetitively over time (like sum() or loc[]). - - Caveat - Don't use this decorator if the any argument to the remote function - will mutate. Following snippet will fail - ```py - @memoize - @ray.remote - def f(obj): - ... - - mutable_obj = [1] - oid_1 = f.remote(mutable_obj) # will be cached - - mutable_obj.append(3) - oid_2 = f.remote(mutable_obj) # cache hit! - - oid_1 == oid_2 # True! - ``` - In short, use this function sparingly. The ideal case is that all - inputs are ray ObjectIDs because they are immutable objects. - - Future Development - - Fix the mutability bug - - Dynamic cache size (Fixed as 1000 for now) - """ - - def __init__(self, f): - # Save of remote function - self.old_remote_func = f.remote - self.cache = LRUCache(capacity=_MEMOIZER_CAPACITY) - - def remote(self, *args): - """Return cached result if the arguments are cached - """ - args = tuple(args) - - if args in self.cache: - cached_result = self.cache[args] - return cached_result - - result = self.old_remote_func(*args) - self.cache[args] = result - return result - - -def _get_nan_block_id(n_row=1, n_col=1, transpose=False): - """A memory efficent way to get a block of NaNs. - - Args: - n_rows(int): number of rows - n_col(int): number of columns - transpose(bool): if true, swap rows and columns - Returns: - ObjectID of the NaN block - """ - global _NAN_BLOCKS - if transpose: - n_row, n_col = n_col, n_row - shape = (n_row, n_col) - if shape not in _NAN_BLOCKS: - arr = np.tile(np.array(np.NaN), shape) - _NAN_BLOCKS[shape] = ray.put(pandas.DataFrame(data=arr)) - return _NAN_BLOCKS[shape] - - -def _get_lengths(df): - """Gets the length of the dataframe. - Args: - df: A remote pandas.DataFrame object. - Returns: - Returns an integer length of the dataframe object. If the attempt - fails, returns 0 as the length. - """ - try: - return len(df) - # Because we sometimes have cases where we have summary statistics in our - # DataFrames - except TypeError: - return 0 - - -def _get_widths(df): - """Gets the width (number of columns) of the dataframe. - Args: - df: A remote pandas.DataFrame object. - Returns: - Returns an integer width of the dataframe object. If the attempt - fails, returns 0 as the length. - """ - try: - return len(df.columns) - # Because we sometimes have cases where we have summary statistics in our - # DataFrames - except TypeError: - return 0 - - -def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None): - """Partitions a Pandas DataFrame object. - Args: - df (pandas.DataFrame): The pandas DataFrame to convert. - npartitions (int): The number of partitions to split the DataFrame - into. Has priority over chunksize. - row_chunksize (int): The number of rows to put in each partition. - Returns: - [ObjectID]: A list of object IDs corresponding to the dataframe - partitions - """ - if num_partitions is not None: - row_chunksize = len(df) // num_partitions \ - if len(df) % num_partitions == 0 \ - else len(df) // num_partitions + 1 - else: - assert row_chunksize is not None - - temp_df = df - - row_partitions = [] - while len(temp_df) > row_chunksize: - t_df = temp_df[:row_chunksize] - # reset_index here because we want a pandas.RangeIndex - # within the partitions. It is smaller and sometimes faster. - t_df.reset_index(drop=True, inplace=True) - t_df.columns = pandas.RangeIndex(0, len(t_df.columns)) - top = ray.put(t_df) - row_partitions.append(top) - temp_df = temp_df[row_chunksize:] - else: - # Handle the last chunk correctly. - # This call is necessary to prevent modifying original df - temp_df = temp_df[:] - temp_df.reset_index(drop=True, inplace=True) - temp_df.columns = pandas.RangeIndex(0, len(temp_df.columns)) - row_partitions.append(ray.put(temp_df)) - - return row_partitions - - -def from_pandas(df, num_partitions=None, chunksize=None): - """Converts a pandas DataFrame to a Ray DataFrame. - Args: - df (pandas.DataFrame): The pandas DataFrame to convert. - num_partitions (int): The number of partitions to split the DataFrame - into. Has priority over chunksize. - chunksize (int): The number of rows to put in each partition. - Returns: - A new Ray DataFrame object. - """ - from .dataframe import DataFrame - - row_partitions = \ - _partition_pandas_dataframe(df, num_partitions, chunksize) - - return DataFrame(row_partitions=row_partitions, - columns=df.columns, - index=df.index) - - -def to_pandas(df): - """Converts a Ray DataFrame to a pandas DataFrame/Series. - Args: - df (ray.DataFrame): The Ray DataFrame to convert. - Returns: - A new pandas DataFrame. - """ - pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False) - pandas_df.index = df.index - pandas_df.columns = df.columns - return pandas_df - - -@ray.remote -def extractor(df_chunk, row_loc, col_loc): - """Retrieve an item from remote block - """ - # We currently have to do the writable flag trick because a pandas bug - # https://github.com/pandas-dev/pandas/issues/17192 - try: - row_loc.flags.writeable = True - col_loc.flags.writeable = True - except AttributeError: - # Locators might be scaler or python list - pass - return df_chunk.iloc[row_loc, col_loc] - - -@ray.remote -def writer(df_chunk, row_loc, col_loc, item): - """Make a copy of the block and write new item to it - """ - df_chunk = df_chunk.copy() - df_chunk.iloc[row_loc, col_loc] = item - return df_chunk - - -def _mask_block_partitions(blk_partitions, row_metadata, col_metadata): - """Return the squeezed/expanded block partitions as defined by - row_metadata and col_metadata. - - Note: - Very naive implementation. Extract one scaler at a time in a double - for loop. - """ - col_df = col_metadata._coord_df - row_df = row_metadata._coord_df - - result_oids = [] - shape = (len(row_df.index), len(col_df.index)) - - for _, row_partition_data in row_df.iterrows(): - for _, col_partition_data in col_df.iterrows(): - row_part = row_partition_data.partition - col_part = col_partition_data.partition - block_oid = blk_partitions[row_part, col_part] - - row_idx = row_partition_data['index_within_partition'] - col_idx = col_partition_data['index_within_partition'] - - result_oid = extractor.remote(block_oid, [row_idx], [col_idx]) - result_oids.append(result_oid) - return np.array(result_oids).reshape(shape) - - -@ray.remote -def _deploy_func(func, dataframe, *args): - """Deploys a function for the _map_partitions call. - Args: - dataframe (pandas.DataFrame): The pandas DataFrame for this partition. - Returns: - A futures object representing the return value of the function - provided. - """ - if len(args) == 0: - return func(dataframe) - else: - return func(dataframe, *args) - - -def _map_partitions(func, partitions, *argslists): - """Apply a function across the specified axis - - Args: - func (callable): The function to apply - partitions ([ObjectID]): The list of partitions to map func on. - - Returns: - A list of partitions ([ObjectID]) with the result of the function - """ - if partitions is None: - return None - - assert(callable(func)) - if len(argslists) == 0: - return [_deploy_func.remote(func, part) for part in partitions] - elif len(argslists) == 1: - return [_deploy_func.remote(func, part, argslists[0]) - for part in partitions] - else: - assert(all(len(args) == len(partitions) for args in argslists)) - return [_deploy_func.remote(func, *args) - for args in zip(partitions, *argslists)] - - -@ray.remote -def _build_col_widths(df_col): - """Compute widths (# of columns) for each partition.""" - widths = np.array(ray.get([_deploy_func.remote(_get_widths, d) - for d in df_col])) - - return widths - - -@ray.remote -def _build_row_lengths(df_row): - """Compute lengths (# of rows) for each partition.""" - lengths = np.array(ray.get([_deploy_func.remote(_get_lengths, d) - for d in df_row])) - - return lengths - - -@ray.remote -def _build_coord_df(lengths, index): - """Build the coordinate dataframe over all partitions.""" - filtered_lengths = [x for x in lengths if x > 0] - coords = None - if len(filtered_lengths) > 0: - coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l))) - for i, l in enumerate(filtered_lengths)]) - col_names = ("partition", "index_within_partition") - return pandas.DataFrame(coords, index=index, columns=col_names) - - -def _create_block_partitions(partitions, axis=0, length=None): - - if length is not None and length != 0 and get_npartitions() > length: - npartitions = length - elif length == 0: - npartitions = 1 - else: - npartitions = get_npartitions() - - x = [create_blocks._submit(args=(partition, npartitions, axis), - num_return_vals=npartitions) - for partition in partitions] - - # In the case that axis is 1 we have to transpose because we build the - # columns into rows. Fortunately numpy is efficient at this. - blocks = np.array(x) if axis == 0 else np.array(x).T - - # Sometimes we only get a single column or row, which is - # problematic for building blocks from the partitions, so we - # add whatever dimension we're missing from the input. - return fix_blocks_dimensions(blocks, axis) - - -@ray.remote -def create_blocks(df, npartitions, axis): - return create_blocks_helper(df, npartitions, axis) - - -def create_blocks_helper(df, npartitions, axis): - # Single partition dataframes don't need to be repartitioned - if npartitions == 1: - return df - # In the case that the size is not a multiple of the number of partitions, - # we need to add one to each partition to avoid losing data off the end - block_size = df.shape[axis ^ 1] // npartitions \ - if df.shape[axis ^ 1] % npartitions == 0 \ - else df.shape[axis ^ 1] // npartitions + 1 - - # if not isinstance(df.columns, pandas.RangeIndex): - # df.columns = pandas.RangeIndex(0, len(df.columns)) - - blocks = [df.iloc[:, i * block_size: (i + 1) * block_size] - if axis == 0 - else df.iloc[i * block_size: (i + 1) * block_size, :] - for i in range(npartitions)] - - for block in blocks: - block.columns = pandas.RangeIndex(0, len(block.columns)) - block.reset_index(inplace=True, drop=True) - return blocks - - -@memoize -@ray.remote -def _blocks_to_col(*partition): - if len(partition): - return pandas.concat(partition, axis=0, copy=False)\ - .reset_index(drop=True) - else: - return pandas.Series() - - -@memoize -@ray.remote -def _blocks_to_row(*partition): - row_part = pandas.concat(partition, axis=1, copy=False)\ - .reset_index(drop=True) - # Because our block partitions contain different indices (for the - # columns), this change is needed to ensure correctness. - row_part.columns = pandas.RangeIndex(0, len(row_part.columns)) - return row_part - - -def _inherit_docstrings(parent, excluded=[]): - """Creates a decorator which overwrites a decorated class' __doc__ - attribute with parent's __doc__ attribute. Also overwrites __doc__ of - methods and properties defined in the class with the __doc__ of matching - methods and properties in parent. - - Args: - parent (object): Class from which the decorated class inherits __doc__. - excluded (list): List of parent objects from which the class does not - inherit docstrings. - - Returns: - function: decorator which replaces the decorated class' documentation - parent's documentation. - """ - def decorator(cls): - if parent not in excluded: - cls.__doc__ = parent.__doc__ - for attr, obj in cls.__dict__.items(): - parent_obj = getattr(parent, attr, None) - if parent_obj in excluded or \ - (not callable(parent_obj) and - not isinstance(parent_obj, property)): - continue - if callable(obj): - obj.__doc__ = parent_obj.__doc__ - elif isinstance(obj, property) and obj.fget is not None: - p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__) - setattr(cls, attr, p) - - return cls - - return decorator - - -@ray.remote -def _reindex_helper(old_index, new_index, axis, npartitions, *df): - """Reindexes a dataframe to prepare for join/concat. - - Args: - df: The DataFrame partition - old_index: The index/column for this partition. - new_index: The new index/column to assign. - axis: Which axis to reindex over. - - Returns: - A new set of blocks made up of DataFrames. - """ - df = pandas.concat(df, axis=axis ^ 1) - if axis == 1: - df.index = old_index - elif axis == 0: - df.columns = old_index - - df = df.reindex(new_index, copy=False, axis=axis ^ 1) - return create_blocks_helper(df, npartitions, axis) - - -@ray.remote -def _co_op_helper(func, left_columns, right_columns, left_df_len, left_idx, - *zipped): - """Copartition operation where two DataFrames must have aligned indexes. - - NOTE: This function assumes things are already copartitioned. Requires that - row partitions are passed in as blocks. - - Args: - func: The operation to conduct between two DataFrames. - left_columns: The column names for the left DataFrame. - right_columns: The column names for the right DataFrame. - left_df_len: The length of the left. This is used so we can split up - the zipped partitions. - zipped: The DataFrame partitions (in blocks). - - Returns: - A new set of blocks for the partitioned DataFrame. - """ - left = pandas.concat(zipped[:left_df_len], axis=1, copy=False).copy() - left.columns = left_columns - if left_idx is not None: - left.index = left_idx - - right = pandas.concat(zipped[left_df_len:], axis=1, copy=False).copy() - right.columns = right_columns - - new_rows = func(left, right) - - new_blocks = create_blocks_helper(new_rows, left_df_len, 0) - - if left_idx is not None: - new_blocks.append(new_rows.index) - - return new_blocks - - -@ray.remote -def _match_partitioning(column_partition, lengths, index): - """Match the number of rows on each partition. Used in df.merge(). - - NOTE: This function can cause problems when there are empty column - partitions. - - The way this function is intended to be used is as follows: Align the - right partitioning with the left. The left will remain unchanged. Then, - you are free to perform actions on a per-partition basis with the - partitioning. - - The index objects must already be identical for this to work correctly. - - Args: - column_partition: The column partition to change. - lengths: The lengths of each row partition to match to. - index: The index index of the column_partition. This is used to push - down to the inner frame for correctness in the merge. - - Returns: - A list of blocks created from this column partition. - """ - partitioned_list = [] - - columns = column_partition.columns - # We set this because this is the only place we can guarantee correct - # placement. We use it in the case the user wants to join on the index. - column_partition.index = index - for length in lengths: - if len(column_partition) == 0: - partitioned_list.append(pandas.DataFrame(columns=columns)) - continue - - partitioned_list.append(column_partition.iloc[:length, :]) - column_partition = column_partition.iloc[length:, :] - return partitioned_list - - -@ray.remote -def _concat_index(*index_parts): - return index_parts[0].append(index_parts[1:]) - - -def fix_blocks_dimensions(blocks, axis): - """Checks that blocks is 2D, and adds a dimension if not. - """ - if blocks.ndim < 2: - return np.expand_dims(blocks, axis=axis ^ 1) - return blocks - - -@ray.remote -def _compile_remote_dtypes(*column_of_blocks): - small_dfs = [df.loc[0:0] for df in column_of_blocks] - return pandas.concat(small_dfs, copy=False).dtypes