diff --git a/.travis.yml b/.travis.yml
index 466c5a6f8..11479bcf1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -143,12 +143,6 @@ matrix:
- python test/cython_test.py
- python test/credis_test.py
- # ray dataframe tests
- # - python -m pytest python/ray/dataframe/test/test_dataframe.py
- - python -m pytest python/ray/dataframe/test/test_concat.py
- - python -m pytest python/ray/dataframe/test/test_io.py
- # - python -m pytest python/ray/dataframe/test/test_groupby.py
-
# ray tune tests
- python python/ray/tune/test/dependency_test.py
- python -m pytest python/ray/tune/test/trial_runner_test.py
@@ -208,12 +202,6 @@ script:
- python test/cython_test.py
- python test/credis_test.py
- # ray dataframe tests
- - python -m pytest python/ray/dataframe/test/test_dataframe.py
- - python -m pytest python/ray/dataframe/test/test_concat.py
- - python -m pytest python/ray/dataframe/test/test_io.py
- - python -m pytest python/ray/dataframe/test/test_groupby.py
-
# ray tune tests
- python python/ray/tune/test/dependency_test.py
- python -m pytest python/ray/tune/test/trial_runner_test.py
diff --git a/.travis/upgrade-syn.sh b/.travis/upgrade-syn.sh
index 8bfaacd39..a7f7cd8c6 100755
--- a/.travis/upgrade-syn.sh
+++ b/.travis/upgrade-syn.sh
@@ -13,7 +13,6 @@ find \
python test \
-name '*.py' -type f \
-not -path 'python/ray/cloudpickle/*' \
- -not -path 'python/ray/dataframe/*' \
-exec python -m pyupgrade {} +
if ! git diff --quiet; then
diff --git a/.travis/yapf.sh b/.travis/yapf.sh
index 45cabf587..7c12ce4b4 100755
--- a/.travis/yapf.sh
+++ b/.travis/yapf.sh
@@ -24,7 +24,6 @@ YAPF_FLAGS=(
)
YAPF_EXCLUDES=(
- '--exclude' 'python/ray/dataframe/*'
'--exclude' 'python/ray/rllib/*'
'--exclude' 'python/ray/cloudpickle/*'
'--exclude' 'python/build/*'
diff --git a/doc/source/pandas_on_ray.rst b/doc/source/pandas_on_ray.rst
index 59d2fc699..4378d12e4 100644
--- a/doc/source/pandas_on_ray.rst
+++ b/doc/source/pandas_on_ray.rst
@@ -1,71 +1,9 @@
Pandas on Ray
=============
-Pandas on Ray is an early stage DataFrame library that wraps Pandas and
-transparently distributes the data and computation. The user does not need to
-know how many cores their system has, nor do they need to specify how to
-distribute the data. In fact, users can continue using their previous Pandas
-notebooks while experiencing a considerable speedup from Pandas on Ray, even
-on a single machine. Only a modification of the import statement is needed, as
-we demonstrate below. Once you’ve changed your import statement, you’re ready
-to use Pandas on Ray just like you would Pandas.
+**Pandas on Ray has moved to Modin!**
-.. code-block:: python
+Pandas on Ray has moved into the `Modin project`_ with the intention of
+unifying the DataFrame APIs.
- # import pandas as pd
- import ray.dataframe as pd
-
-Currently, we have part of the Pandas API implemented and are working toward
-full functional parity with Pandas.
-
-Using Pandas on Ray on a Single Node
-------------------------------------
-
-In order to use the most up-to-date version of Pandas on Ray, please follow
-the instructions on the `installation page`_
-
-Once you import the library, you should see something similar to the following
-output:
-
-.. code-block:: text
-
- >>> import ray.dataframe as pd
-
- Waiting for redis server at 127.0.0.1:14618 to respond...
- Waiting for redis server at 127.0.0.1:31410 to respond...
- Starting local scheduler with the following resources: {'CPU': 4, 'GPU': 0}.
-
- ======================================================================
- View the web UI at http://localhost:8889/notebooks/ray_ui36796.ipynb?token=ac25867d62c4ae87941bc5a0ecd5f517dbf80bd8e9b04218
- ======================================================================
-
-If you do not see output similar to the above, please make sure that you have
-built Ray using the instructions on the `installation page`_
-
-One you have executed ``import ray.dataframe as pd``, you're ready to begin
-running your Pandas pipeline as you were before. Please note, the API is not
-yet complete. For some methods, you may see the following:
-
-.. code-block:: text
-
- NotImplementedError: To contribute to Pandas on Ray, please visit github.com/ray-project/ray.
-
-If you would like to request a particular method be implemented, feel free to
-`open an issue`_. Before you open an issue please make sure that someone else
-has not already requested that functionality.
-
-Using Pandas on Ray on a Cluster
---------------------------------
-
-Currently, we do not yet support running Pandas on Ray on a cluster. Coming
-Soon!
-
-Examples
---------
-You can find an example on our recent `blog post`_ or on the
-`Jupyter Notebook`_ that we used to create the blog post.
-
-.. _`installation page`: http://ray.readthedocs.io/en/latest/installation.html
-.. _`open an issue`: http://github.com/ray-project/ray/issues
-.. _`blog post`: http://rise.cs.berkeley.edu/blog/pandas-on-ray
-.. _`Jupyter Notebook`: http://gist.github.com/devin-petersohn/f424d9fb5579a96507c709a36d487f24#file-pandas_on_ray_blog_post_0-ipynb
+.. _`Modin project`: https://github.com/modin-project/modin
diff --git a/python/ray/dataframe/__init__.py b/python/ray/dataframe/__init__.py
index d71ac1e3d..a22f7ff61 100644
--- a/python/ray/dataframe/__init__.py
+++ b/python/ray/dataframe/__init__.py
@@ -1,60 +1,2 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pandas
-# TODO: In the future `set_option` or similar needs to run on every node
-# in order to keep all pandas instances across nodes consistent
-from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
- test, qcut, match, Panel, date_range, Index, MultiIndex,
- CategoricalIndex, Series, bdate_range, DatetimeIndex,
- Timedelta, Timestamp, to_timedelta, set_eng_float_format,
- set_option, NaT, PeriodIndex, Categorical)
-import threading
-
-pandas_version = pandas.__version__
-pandas_major = int(pandas_version.split(".")[0])
-pandas_minor = int(pandas_version.split(".")[1])
-
-if pandas_major == 0 and pandas_minor != 22:
- raise Exception("In order to use Pandas on Ray, your pandas version must "
- "be 0.22. You can run 'pip install pandas==0.22'")
-
-DEFAULT_NPARTITIONS = 8
-
-
-def set_npartition_default(n):
- global DEFAULT_NPARTITIONS
- DEFAULT_NPARTITIONS = n
-
-
-def get_npartitions():
- return DEFAULT_NPARTITIONS
-
-
-# We import these file after above two function
-# because they depend on npartitions.
-from .dataframe import DataFrame # noqa: 402
-from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402
- read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402
- read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402
- read_sql) # noqa: 402
-from .concat import concat # noqa: 402
-from .datetimes import to_datetime # noqa: 402
-from .reshape import get_dummies # noqa: 402
-
-__all__ = [
- "DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval",
- "unique", "value_counts", "cut", "to_numeric", "factorize", "test", "qcut",
- "match", "to_datetime", "get_dummies", "Panel", "date_range", "Index",
- "MultiIndex", "Series", "bdate_range", "DatetimeIndex", "to_timedelta",
- "set_eng_float_format", "set_option", "CategoricalIndex", "Timedelta",
- "Timestamp", "NaT", "PeriodIndex", "Categorical"
-]
-
-try:
- if threading.current_thread().name == "MainThread":
- import ray
- ray.init()
-except AssertionError:
- pass
+raise DeprecationWarning("Pandas on Ray has moved to Modin: "
+ "github.com/modin-project/modin")
diff --git a/python/ray/dataframe/concat.py b/python/ray/dataframe/concat.py
deleted file mode 100644
index ef3686284..000000000
--- a/python/ray/dataframe/concat.py
+++ /dev/null
@@ -1,133 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pandas
-import numpy as np
-from .dataframe import DataFrame
-from .utils import _reindex_helper
-
-
-def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
- keys=None, levels=None, names=None, verify_integrity=False,
- copy=True):
-
- if keys is not None:
- objs = [objs[k] for k in keys]
- else:
- objs = list(objs)
-
- if len(objs) == 0:
- raise ValueError("No objects to concatenate")
-
- objs = [obj for obj in objs if obj is not None]
-
- if len(objs) == 0:
- raise ValueError("All objects passed were None")
-
- try:
- type_check = next(obj for obj in objs
- if not isinstance(obj, (pandas.Series,
- pandas.DataFrame,
- DataFrame)))
- except StopIteration:
- type_check = None
- if type_check is not None:
- raise ValueError("cannot concatenate object of type \"{0}\"; only "
- "pandas.Series, pandas.DataFrame, "
- "and ray.dataframe.DataFrame objs are "
- "valid", type(type_check))
-
- all_series = all(isinstance(obj, pandas.Series)
- for obj in objs)
- if all_series:
- return DataFrame(pandas.concat(objs, axis, join, join_axes,
- ignore_index, keys, levels, names,
- verify_integrity, copy))
-
- if isinstance(objs, dict):
- raise NotImplementedError(
- "Obj as dicts not implemented. To contribute to "
- "Pandas on Ray, please visit github.com/ray-project/ray.")
-
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- if join not in ['inner', 'outer']:
- raise ValueError("Only can inner (intersect) or outer (union) join the"
- " other axis")
-
- # We need this in a list because we use it later.
- all_index, all_columns = list(zip(*[(obj.index, obj.columns)
- for obj in objs]))
-
- def series_to_df(series, columns):
- df = pandas.DataFrame(series)
- df.columns = columns
- return DataFrame(df)
-
- # Pandas puts all of the Series in a single column named 0. This is
- # true regardless of the existence of another column named 0 in the
- # concat.
- if axis == 0:
- objs = [series_to_df(obj, [0])
- if isinstance(obj, pandas.Series) else obj for obj in objs]
- else:
- # Pandas starts the count at 0 so this will increment the names as
- # long as there's a new nameless Series being added.
- def name_incrementer(i):
- val = i[0]
- i[0] += 1
- return val
-
- i = [0]
- objs = [series_to_df(obj, obj.name if obj.name is not None
- else name_incrementer(i))
- if isinstance(obj, pandas.Series) else obj for obj in objs]
-
- # Using concat on the columns and index is fast because they're empty,
- # and it forces the error checking. It also puts the columns in the
- # correct order for us.
- final_index = \
- pandas.concat([pandas.DataFrame(index=idx) for idx in all_index],
- axis=axis, join=join, join_axes=join_axes,
- ignore_index=ignore_index, keys=keys, levels=levels,
- names=names, verify_integrity=verify_integrity,
- copy=False).index
- final_columns = \
- pandas.concat([pandas.DataFrame(columns=col)
- for col in all_columns],
- axis=axis, join=join, join_axes=join_axes,
- ignore_index=ignore_index, keys=keys, levels=levels,
- names=names, verify_integrity=verify_integrity,
- copy=False).columns
-
- # Put all of the DataFrames into Ray format
- # TODO just partition the DataFrames instead of building a new Ray DF.
- objs = [DataFrame(obj) if isinstance(obj, (pandas.DataFrame,
- pandas.Series)) else obj
- for obj in objs]
-
- # Here we reuse all_columns/index so we don't have to materialize objects
- # from remote memory built in the previous line. In the future, we won't be
- # building new DataFrames, rather just partitioning the DataFrames.
- if axis == 0:
- new_blocks = np.array([_reindex_helper._submit(
- args=tuple([all_columns[i], final_columns, axis,
- len(objs[0]._block_partitions)] + part.tolist()),
- num_return_vals=len(objs[0]._block_partitions))
- for i in range(len(objs))
- for part in objs[i]._block_partitions])
- else:
- # Transposing the columns is necessary because the remote task treats
- # everything like rows and returns in row-major format. Luckily, this
- # operation is cheap in numpy.
- new_blocks = np.array([_reindex_helper._submit(
- args=tuple([all_index[i], final_index, axis,
- len(objs[0]._block_partitions.T)] + part.tolist()),
- num_return_vals=len(objs[0]._block_partitions.T))
- for i in range(len(objs))
- for part in objs[i]._block_partitions.T]).T
-
- return DataFrame(block_partitions=new_blocks,
- columns=final_columns,
- index=final_index)
diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py
deleted file mode 100644
index 590f4a993..000000000
--- a/python/ray/dataframe/dataframe.py
+++ /dev/null
@@ -1,5413 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pandas
-import functools
-from pandas.api.types import is_scalar
-from pandas.util._validators import validate_bool_kwarg
-from pandas.core.index import _ensure_index_from_sequences
-from pandas._libs import lib
-from pandas.core.dtypes.cast import maybe_upcast_putmask
-from pandas import compat
-from pandas.compat import lzip, to_str, string_types, cPickle as pkl
-import pandas.core.common as com
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_list_like,
- is_numeric_dtype,
- is_timedelta64_dtype,
- _get_dtype_from_object)
-from pandas.core.indexing import check_bool_indexer
-from pandas.errors import MergeError
-
-import warnings
-import numpy as np
-from numpy.testing import assert_equal
-import ray
-import itertools
-import io
-import sys
-import re
-
-from .utils import (
- _deploy_func,
- _map_partitions,
- _partition_pandas_dataframe,
- to_pandas,
- create_blocks_helper,
- _blocks_to_col,
- _blocks_to_row,
- _create_block_partitions,
- _inherit_docstrings,
- _reindex_helper,
- _co_op_helper,
- _match_partitioning,
- _concat_index,
- fix_blocks_dimensions,
- _compile_remote_dtypes)
-from . import get_npartitions
-from .index_metadata import _IndexMetadata
-from .iterator import PartitionIterator
-
-
-@_inherit_docstrings(pandas.DataFrame,
- excluded=[pandas.DataFrame, pandas.DataFrame.__init__])
-class DataFrame(object):
-
- def __init__(self, data=None, index=None, columns=None, dtype=None,
- copy=False, col_partitions=None, row_partitions=None,
- block_partitions=None, row_metadata=None, col_metadata=None,
- dtypes_cache=None):
- """Distributed DataFrame object backed by Pandas dataframes.
-
- Args:
- data (numpy ndarray (structured or homogeneous) or dict):
- Dict can contain Series, arrays, constants, or list-like
- objects.
- index (pandas.Index, list, ObjectID): The row index for this
- dataframe.
- columns (pandas.Index): The column names for this dataframe, in
- pandas Index object.
- dtype: Data type to force. Only a single dtype is allowed.
- If None, infer
- copy (boolean): Copy data from inputs.
- Only affects DataFrame / 2d ndarray input
- col_partitions ([ObjectID]): The list of ObjectIDs that contain
- the column dataframe partitions.
- row_partitions ([ObjectID]): The list of ObjectIDs that contain the
- row dataframe partitions.
- block_partitions: A 2D numpy array of block partitions.
- row_metadata (_IndexMetadata):
- Metadata for the new dataframe's rows
- col_metadata (_IndexMetadata):
- Metadata for the new dataframe's columns
- """
- if isinstance(data, DataFrame):
- self._frame_data = data._frame_data
- return
-
- self._dtypes_cache = dtypes_cache
-
- # Check type of data and use appropriate constructor
- if data is not None or (col_partitions is None and
- row_partitions is None and
- block_partitions is None):
-
- pandas_df = pandas.DataFrame(data=data, index=index,
- columns=columns, dtype=dtype,
- copy=copy)
-
- # Cache dtypes
- self._dtypes_cache = pandas_df.dtypes
-
- # TODO convert _partition_pandas_dataframe to block partitioning.
- row_partitions = \
- _partition_pandas_dataframe(pandas_df,
- num_partitions=get_npartitions())
-
- self._block_partitions = \
- _create_block_partitions(row_partitions, axis=0,
- length=len(pandas_df.columns))
-
- # Set in case we were only given a single row/column for below.
- axis = 0
- columns = pandas_df.columns
- index = pandas_df.index
- else:
- # created this invariant to make sure we never have to go into the
- # partitions to get the columns
- assert columns is not None or col_metadata is not None, \
- "Columns not defined, must define columns or col_metadata " \
- "for internal DataFrame creations"
-
- if block_partitions is not None:
- axis = 0
- # put in numpy array here to make accesses easier since it's 2D
- self._block_partitions = np.array(block_partitions)
- self._block_partitions = \
- fix_blocks_dimensions(self._block_partitions, axis)
-
- else:
- if row_partitions is not None:
- axis = 0
- partitions = row_partitions
- axis_length = len(columns) if columns is not None else \
- len(col_metadata)
- elif col_partitions is not None:
- axis = 1
- partitions = col_partitions
- axis_length = len(index) if index is not None else \
- len(row_metadata)
- # All partitions will already have correct dtypes
- self._dtypes_cache = [
- _deploy_func.remote(lambda df: df.dtypes, pandas_df)
- for pandas_df in col_partitions
- ]
-
- # TODO: write explicit tests for "short and wide"
- # column partitions
- self._block_partitions = \
- _create_block_partitions(partitions, axis=axis,
- length=axis_length)
-
- assert self._block_partitions.ndim == 2, "Block Partitions must be 2D."
-
- # Create the row and column index objects for using our partitioning.
- # If the objects haven't been inherited, then generate them
- if row_metadata is not None:
- self._row_metadata = row_metadata.copy()
- if index is not None:
- self.index = index
- else:
- self._row_metadata = _IndexMetadata(self._block_partitions[:, 0],
- index=index, axis=0)
-
- if col_metadata is not None:
- self._col_metadata = col_metadata.copy()
- if columns is not None:
- self.columns = columns
- else:
- self._col_metadata = _IndexMetadata(self._block_partitions[0, :],
- index=columns, axis=1)
-
- if self._dtypes_cache is None:
- self._get_remote_dtypes()
-
- def _get_frame_data(self):
- data = {}
- data['blocks'] = self._block_partitions
- data['col_metadata'] = self._col_metadata
- data['row_metadata'] = self._row_metadata
- data['columns'] = self.columns
- data['index'] = self.index
- data['dtypes'] = self._dtypes_cache
-
- return data
-
- def _set_frame_data(self, data):
- self._block_partitions = data['blocks']
- self._col_metadata = data['col_metadata']
- self._row_metadata = data['row_metadata']
- self.columns = data['columns']
- self.index = data['index']
- self._dtypes_cache = data['dtypes']
-
- _frame_data = property(_get_frame_data, _set_frame_data)
-
- def _get_row_partitions(self):
- empty_rows_mask = self._row_metadata._lengths > 0
- if any(empty_rows_mask):
- self._row_metadata._lengths = \
- self._row_metadata._lengths[empty_rows_mask]
- self._block_partitions = self._block_partitions[empty_rows_mask, :]
- return [_blocks_to_row.remote(*part)
- for i, part in enumerate(self._block_partitions)]
-
- def _set_row_partitions(self, new_row_partitions):
- self._block_partitions = \
- _create_block_partitions(new_row_partitions, axis=0,
- length=len(self.columns))
-
- _row_partitions = property(_get_row_partitions, _set_row_partitions)
-
- def _get_col_partitions(self):
- empty_cols_mask = self._col_metadata._lengths > 0
- if any(empty_cols_mask):
- self._col_metadata._lengths = \
- self._col_metadata._lengths[empty_cols_mask]
- self._block_partitions = self._block_partitions[:, empty_cols_mask]
- return [_blocks_to_col.remote(*self._block_partitions[:, i])
- for i in range(self._block_partitions.shape[1])]
-
- def _set_col_partitions(self, new_col_partitions):
- self._block_partitions = \
- _create_block_partitions(new_col_partitions, axis=1,
- length=len(self.index))
-
- _col_partitions = property(_get_col_partitions, _set_col_partitions)
-
- def __str__(self):
- return repr(self)
-
- def _repr_helper_(self):
- if len(self._row_metadata) <= 60 and \
- len(self._col_metadata) <= 20:
- return to_pandas(self)
-
- def head(df, n, get_local_head=False):
- """Compute the head for this without creating a new DataFrame"""
- if get_local_head:
- return df.head(n)
-
- new_dfs = _map_partitions(lambda df: df.head(n),
- df)
-
- index = self.index[:n]
- pandas_head = pandas.concat(ray.get(new_dfs), axis=1, copy=False)
- pandas_head.index = index
- pandas_head.columns = self.columns
- return pandas_head
-
- def tail(df, n, get_local_tail=False):
- """Compute the tail for this without creating a new DataFrame"""
- if get_local_tail:
- return df.tail(n)
-
- new_dfs = _map_partitions(lambda df: df.tail(n),
- df)
-
- index = self.index[-n:]
- pandas_tail = pandas.concat(ray.get(new_dfs), axis=1, copy=False)
- pandas_tail.index = index
- pandas_tail.columns = self.columns
- return pandas_tail
-
- def front(df, n):
- """Get first n columns without creating a new Dataframe"""
-
- cum_col_lengths = self._col_metadata._lengths.cumsum()
- index = np.argmax(cum_col_lengths >= 10)
- pandas_front = pandas.concat(ray.get(x[:index + 1]),
- axis=1, copy=False)
- pandas_front = pandas_front.iloc[:, :n]
- pandas_front.index = self.index
- pandas_front.columns = self.columns[:n]
- return pandas_front
-
- def back(df, n):
- """Get last n columns without creating a new Dataframe"""
-
- cum_col_lengths = np.flip(self._col_metadata._lengths,
- axis=0).cumsum()
- index = np.argmax(cum_col_lengths >= 10)
- pandas_back = pandas.concat(ray.get(x[-(index + 1):]),
- axis=1, copy=False)
- pandas_back = pandas_back.iloc[:, -n:]
- pandas_back.index = self.index
- pandas_back.columns = self.columns[-n:]
- return pandas_back
-
- x = self._col_partitions
- get_local_head = False
-
- # Get first and last 10 columns if there are more than 20 columns
- if len(self._col_metadata) >= 20:
- get_local_head = True
- front = front(x, 10)
- back = back(x, 10)
-
- col_dots = pandas.Series(["..." for _ in range(len(self.index))])
- col_dots.index = self.index
- col_dots.name = "..."
- x = pandas.concat([front, col_dots, back], axis=1, copy=False)
-
- # If less than 60 rows, x is already in the correct format.
- if len(self._row_metadata) < 60:
- return x
-
- head = head(x, 30, get_local_head)
- tail = tail(x, 30, get_local_head)
-
- # Make the dots in between the head and tail
- row_dots = pandas.Series(["..." for _ in range(len(head.columns))])
- row_dots.index = head.columns
- row_dots.name = "..."
-
- # We have to do it this way or convert dots to a dataframe and
- # transpose. This seems better.
- result = head.append(row_dots).append(tail)
- return result
-
- def __repr__(self):
- # We use pandas repr so that we match them.
- if len(self._row_metadata) <= 60 and \
- len(self._col_metadata) <= 20:
- return repr(self._repr_helper_())
- # The split here is so that we don't repr pandas row lengths.
- result = self._repr_helper_()
- final_result = repr(result).rsplit("\n\n", maxsplit=1)[0] + \
- "\n\n[{0} rows x {1} columns]".format(len(self.index),
- len(self.columns))
- return final_result
-
- def _repr_html_(self):
- """repr function for rendering in Jupyter Notebooks like Pandas
- Dataframes.
-
- Returns:
- The HTML representation of a Dataframe.
- """
- # We use pandas _repr_html_ to get a string of the HTML representation
- # of the dataframe.
- if len(self._row_metadata) <= 60 and \
- len(self._col_metadata) <= 20:
- return self._repr_helper_()._repr_html_()
- # We split so that we insert our correct dataframe dimensions.
- result = self._repr_helper_()._repr_html_()
- return result.split("<p>")[0] + \
- "<p>{0} rows x {1} columns</p>\n</div>".format(len(self.index),
- len(self.columns))
-
- def _get_index(self):
- """Get the index for this DataFrame.
-
- Returns:
- The union of all indexes across the partitions.
- """
- return self._row_metadata.index
-
- def _set_index(self, new_index):
- """Set the index for this DataFrame.
-
- Args:
- new_index: The new index to set this
- """
- self._row_metadata.index = new_index
-
- index = property(_get_index, _set_index)
-
- def _get_columns(self):
- """Get the columns for this DataFrame.
-
- Returns:
- The union of all indexes across the partitions.
- """
- return self._col_metadata.index
-
- def _set_columns(self, new_index):
- """Set the columns for this DataFrame.
-
- Args:
- new_index: The new index to set this
- """
- self._col_metadata.index = new_index
-
- columns = property(_get_columns, _set_columns)
-
- def _arithmetic_helper(self, remote_func, axis, level=None):
- # TODO: We don't support `level` right now
- if level is not None:
- raise NotImplementedError("Level not yet supported.")
-
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
-
- oid_series = ray.get(_map_partitions(remote_func,
- self._col_partitions if axis == 0
- else self._row_partitions))
-
- if axis == 0:
- # We use the index to get the internal index.
- oid_series = [(oid_series[i], i) for i in range(len(oid_series))]
-
- if len(oid_series) > 0:
- for df, partition in oid_series:
- this_partition = \
- self._col_metadata.partition_series(partition)
- df.index = \
- this_partition[this_partition.isin(df.index)].index
-
- result_series = pandas.concat([obj[0] for obj in oid_series],
- axis=0, copy=False)
- else:
- result_series = pandas.concat(oid_series, axis=0, copy=False)
- result_series.index = self.index
- return result_series
-
- def _validate_eval_query(self, expr, **kwargs):
- """Helper function to check the arguments to eval() and query()
-
- Args:
- expr: The expression to evaluate. This string cannot contain any
- Python statements, only Python expressions.
- """
- if isinstance(expr, str) and expr is '':
- raise ValueError("expr cannot be an empty string")
-
- if isinstance(expr, str) and '@' in expr:
- raise NotImplementedError("Local variables not yet supported in "
- "eval.")
-
- if isinstance(expr, str) and 'not' in expr:
- if 'parser' in kwargs and kwargs['parser'] == 'python':
- raise NotImplementedError("'Not' nodes are not implemented.")
-
- @property
- def size(self):
- """Get the number of elements in the DataFrame.
-
- Returns:
- The number of elements in the DataFrame.
- """
- return len(self.index) * len(self.columns)
-
- @property
- def ndim(self):
- """Get the number of dimensions for this DataFrame.
-
- Returns:
- The number of dimensions for this DataFrame.
- """
- # The number of dimensions is common across all partitions.
- # The first partition will be enough.
- return ray.get(_deploy_func.remote(lambda df: df.ndim,
- self._row_partitions[0]))
-
- @property
- def ftypes(self):
- """Get the ftypes for this DataFrame.
-
- Returns:
- The ftypes for this DataFrame.
- """
- # The ftypes are common across all partitions.
- # The first partition will be enough.
- result = ray.get(_deploy_func.remote(lambda df: df.ftypes,
- self._row_partitions[0]))
- result.index = self.columns
- return result
-
- def _get_remote_dtypes(self):
- """Finds and caches ObjectIDs for the dtypes of each column partition.
- """
- self._dtypes_cache = [_compile_remote_dtypes.remote(*column)
- for column in self._block_partitions.T]
-
- @property
- def dtypes(self):
- """Get the dtypes for this DataFrame.
-
- Returns:
- The dtypes for this DataFrame.
- """
- assert self._dtypes_cache is not None
-
- if isinstance(self._dtypes_cache, list) and \
- isinstance(self._dtypes_cache[0],
- ray.ObjectID):
- self._dtypes_cache = pandas.concat(ray.get(self._dtypes_cache),
- copy=False)
- self._dtypes_cache.index = self.columns
-
- return self._dtypes_cache
-
- @property
- def empty(self):
- """Determines if the DataFrame is empty.
-
- Returns:
- True if the DataFrame is empty.
- False otherwise.
- """
- return len(self.columns) == 0 or len(self.index) == 0
-
- @property
- def values(self):
- """Create a numpy array with the values from this DataFrame.
-
- Returns:
- The numpy representation of this DataFrame.
- """
- return np.concatenate(ray.get(_map_partitions(
- lambda df: df.values, self._row_partitions)))
-
- @property
- def axes(self):
- """Get the axes for the DataFrame.
-
- Returns:
- The axes for the DataFrame.
- """
- return [self.index, self.columns]
-
- @property
- def shape(self):
- """Get the size of each of the dimensions in the DataFrame.
-
- Returns:
- A tuple with the size of each dimension as they appear in axes().
- """
- return len(self.index), len(self.columns)
-
- def _update_inplace(self, row_partitions=None, col_partitions=None,
- block_partitions=None, columns=None, index=None,
- col_metadata=None, row_metadata=None):
- """updates the current DataFrame inplace.
-
- Behavior should be similar to the constructor, given the corresponding
- arguments. Note that len(columns) and len(index) should match the
- corresponding dimensions in the partition(s) passed in, otherwise this
- function will complain.
-
- Args:
- row_partitions ([ObjectID]):
- The new partitions to replace self._row_partitions directly
- col_partitions ([ObjectID]):
- The new partitions to replace self._col_partitions directly
- columns (pandas.Index):
- Index of the column dimension to replace existing columns
- index (pandas.Index):
- Index of the row dimension to replace existing index
-
- Note:
- If `columns` or `index` are not supplied, they will revert to
- default columns or index respectively, as this function does
- not have enough contextual info to rebuild the indexes
- correctly based on the addition/subtraction of rows/columns.
- """
- assert row_partitions is not None or col_partitions is not None\
- or block_partitions is not None, \
- "To update inplace, new column or row partitions must be set."
-
- if block_partitions is not None:
- self._block_partitions = block_partitions
-
- elif row_partitions is not None:
- self._row_partitions = row_partitions
-
- elif col_partitions is not None:
- self._col_partitions = col_partitions
-
- if col_metadata is not None:
- self._col_metadata = col_metadata
- else:
- assert columns is not None, \
- "If col_metadata is None, columns must be passed in"
- self._col_metadata = _IndexMetadata(
- self._block_partitions[0, :], index=columns, axis=1)
- if row_metadata is not None:
- self._row_metadata = row_metadata
- else:
- # Index can be None for default index, so we don't check
- self._row_metadata = _IndexMetadata(
- self._block_partitions[:, 0], index=index, axis=0)
-
- # Update dtypes
- self._get_remote_dtypes()
-
- def add_prefix(self, prefix):
- """Add a prefix to each of the column names.
-
- Returns:
- A new DataFrame containing the new column names.
- """
- new_cols = self.columns.map(lambda x: str(prefix) + str(x))
- return DataFrame(block_partitions=self._block_partitions,
- columns=new_cols,
- col_metadata=self._col_metadata,
- row_metadata=self._row_metadata,
- dtypes_cache=self._dtypes_cache)
-
- def add_suffix(self, suffix):
- """Add a suffix to each of the column names.
-
- Returns:
- A new DataFrame containing the new column names.
- """
- new_cols = self.columns.map(lambda x: str(x) + str(suffix))
- return DataFrame(block_partitions=self._block_partitions,
- columns=new_cols,
- col_metadata=self._col_metadata,
- row_metadata=self._row_metadata,
- dtypes_cache=self._dtypes_cache)
-
- def applymap(self, func):
- """Apply a function to a DataFrame elementwise.
-
- Args:
- func (callable): The function to apply.
- """
- if not callable(func):
- raise ValueError(
- "\'{0}\' object is not callable".format(type(func)))
-
- new_block_partitions = np.array([
- _map_partitions(lambda df: df.applymap(func), block)
- for block in self._block_partitions])
-
- return DataFrame(block_partitions=new_block_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata)
-
- def copy(self, deep=True):
- """Creates a shallow copy of the DataFrame.
-
- Returns:
- A new DataFrame pointing to the same partitions as this one.
- """
- return DataFrame(block_partitions=self._block_partitions,
- columns=self.columns,
- index=self.index,
- dtypes_cache=self._dtypes_cache)
-
- def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
- group_keys=True, squeeze=False, **kwargs):
- """Apply a groupby to this DataFrame. See _groupby() remote task.
- Args:
- by: The value to groupby.
- axis: The axis to groupby.
- level: The level of the groupby.
- as_index: Whether or not to store result as index.
- sort: Whether or not to sort the result by the index.
- group_keys: Whether or not to group the keys.
- squeeze: Whether or not to squeeze.
- Returns:
- A new DataFrame resulting from the groupby.
- """
- axis = pandas.DataFrame()._get_axis_number(axis)
- if callable(by):
- by = by(self.index)
- elif isinstance(by, compat.string_types):
- by = self.__getitem__(by).values.tolist()
- elif is_list_like(by):
- if isinstance(by, pandas.Series):
- by = by.values.tolist()
-
- mismatch = len(by) != len(self) if axis == 0 \
- else len(by) != len(self.columns)
-
- if all(obj in self for obj in by) and mismatch:
- raise NotImplementedError(
- "Groupby with lists of columns not yet supported.")
- elif mismatch:
- raise KeyError(next(x for x in by if x not in self))
-
- from .groupby import DataFrameGroupBy
- return DataFrameGroupBy(self, by, axis, level, as_index, sort,
- group_keys, squeeze, **kwargs)
-
- def sum(self, axis=None, skipna=True, level=None, numeric_only=None,
- min_count=1, **kwargs):
- """Perform a sum across the DataFrame.
-
- Args:
- axis (int): The axis to sum on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The sum of the DataFrame.
- """
- def remote_func(df):
- return df.sum(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, min_count=min_count,
- **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def abs(self):
- """Apply an absolute value function to all numeric columns.
-
- Returns:
- A new DataFrame with the applied absolute value.
- """
- for t in self.dtypes:
- if np.dtype('O') == t:
- # TODO Give a more accurate error to Pandas
- raise TypeError("bad operand type for abs():", "str")
-
- new_block_partitions = np.array([_map_partitions(lambda df: df.abs(),
- block)
- for block in self._block_partitions])
-
- return DataFrame(block_partitions=new_block_partitions,
- columns=self.columns,
- index=self.index,
- dtypes_cache=self._dtypes_cache)
-
- def isin(self, values):
- """Fill a DataFrame with booleans for cells contained in values.
-
- Args:
- values (iterable, DataFrame, Series, or dict): The values to find.
-
- Returns:
- A new DataFrame with booleans representing whether or not a cell
- is in values.
- True: cell is contained in values.
- False: otherwise
- """
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.isin(values), block)
- for block in self._block_partitions])
-
- return DataFrame(block_partitions=new_block_partitions,
- columns=self.columns,
- index=self.index)
-
- def isna(self):
- """Fill a DataFrame with booleans for cells containing NA.
-
- Returns:
- A new DataFrame with booleans representing whether or not a cell
- is NA.
- True: cell contains NA.
- False: otherwise.
- """
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.isna(), block) for block in self._block_partitions])
-
- new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns),
- index=self.columns)
-
- return DataFrame(block_partitions=new_block_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata,
- dtypes_cache=new_dtypes)
-
- def isnull(self):
- """Fill a DataFrame with booleans for cells containing a null value.
-
- Returns:
- A new DataFrame with booleans representing whether or not a cell
- is null.
- True: cell contains null.
- False: otherwise.
- """
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.isnull(), block)
- for block in self._block_partitions])
-
- new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns),
- index=self.columns)
-
- return DataFrame(block_partitions=new_block_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata,
- dtypes_cache=new_dtypes)
-
- def keys(self):
- """Get the info axis for the DataFrame.
-
- Returns:
- A pandas Index for this DataFrame.
- """
- # Each partition should have the same index, so we'll use 0's
- return self.columns
-
- def transpose(self, *args, **kwargs):
- """Transpose columns and rows for the DataFrame.
-
- Returns:
- A new DataFrame transposed from this DataFrame.
- """
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.T, block) for block in self._block_partitions])
-
- return DataFrame(block_partitions=new_block_partitions.T,
- columns=self.index,
- index=self.columns)
-
- T = property(transpose)
-
- def dropna(self, axis=0, how='any', thresh=None, subset=None,
- inplace=False):
- """Create a new DataFrame from the removed NA values from this one.
-
- Args:
- axis (int, tuple, or list): The axis to apply the drop.
- how (str): How to drop the NA values.
- 'all': drop the label if all values are NA.
- 'any': drop the label if any values are NA.
- thresh (int): The minimum number of NAs to require.
- subset ([label]): Labels to consider from other axis.
- inplace (bool): Change this DataFrame or return a new DataFrame.
- True: Modify the data for this DataFrame, return None.
- False: Create a new DataFrame and return it.
-
- Returns:
- If inplace is set to True, returns None, otherwise returns a new
- DataFrame with the dropna applied.
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- if is_list_like(axis):
- axis = [pandas.DataFrame()._get_axis_number(ax) for ax in axis]
-
- result = self
- # TODO(kunalgosar): this builds an intermediate dataframe,
- # which does unnecessary computation
- for ax in axis:
- result = result.dropna(
- axis=ax, how=how, thresh=thresh, subset=subset)
- if not inplace:
- return result
-
- self._update_inplace(block_partitions=result._block_partitions,
- columns=result.columns,
- index=result.index)
-
- return None
-
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- if how is not None and how not in ['any', 'all']:
- raise ValueError('invalid how option: %s' % how)
- if how is None and thresh is None:
- raise TypeError('must specify how or thresh')
-
- indices = None
- if subset is not None:
- if axis == 1:
- indices = self.index.get_indexer_for(subset)
- check = indices == -1
- if check.any():
- raise KeyError(list(np.compress(check, subset)))
- else:
- indices = self.columns.get_indexer_for(subset)
- check = indices == -1
- if check.any():
- raise KeyError(list(np.compress(check, subset)))
-
- def dropna_helper(df):
- new_df = df.dropna(axis=axis, how=how, thresh=thresh,
- subset=indices, inplace=False)
-
- if axis == 1:
- new_index = new_df.columns
- new_df.columns = pandas.RangeIndex(0, len(new_df.columns))
- else:
- new_index = new_df.index
- new_df.reset_index(drop=True, inplace=True)
-
- return new_df, new_index
-
- parts = self._col_partitions if axis == 1 else self._row_partitions
- result = [_deploy_func._submit(args=(dropna_helper, df),
- num_return_vals=2) for df in parts]
- new_parts, new_vals = [list(t) for t in zip(*result)]
-
- if axis == 1:
- new_vals = [self._col_metadata.get_global_indices(i, vals)
- for i, vals in enumerate(ray.get(new_vals))]
-
- # This flattens the 2d array to 1d
- new_vals = [i for j in new_vals for i in j]
- new_cols = self.columns[new_vals]
-
- if not inplace:
- return DataFrame(col_partitions=new_parts,
- columns=new_cols,
- index=self.index)
-
- self._update_inplace(col_partitions=new_parts,
- columns=new_cols,
- index=self.index)
-
- else:
- new_vals = [self._row_metadata.get_global_indices(i, vals)
- for i, vals in enumerate(ray.get(new_vals))]
-
- # This flattens the 2d array to 1d
- new_vals = [i for j in new_vals for i in j]
- new_rows = self.index[new_vals]
-
- if not inplace:
- return DataFrame(row_partitions=new_parts,
- index=new_rows,
- columns=self.columns)
-
- self._update_inplace(row_partitions=new_parts,
- index=new_rows,
- columns=self.columns)
-
- return None
-
- def add(self, other, axis='columns', level=None, fill_value=None):
- """Add this DataFrame to another or a scalar/list.
-
- Args:
- other: What to add this this DataFrame.
- axis: The axis to apply addition over. Only applicaable to Series
- or list 'other'.
- level: A level in the multilevel axis to add over.
- fill_value: The value to fill NaN.
-
- Returns:
- A new DataFrame with the applied addition.
- """
- return self._operator_helper(pandas.DataFrame.add, other, axis, level,
- fill_value)
-
- def agg(self, func, axis=0, *args, **kwargs):
- return self.aggregate(func, axis, *args, **kwargs)
-
- def aggregate(self, func, axis=0, *args, **kwargs):
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- result = None
-
- if axis == 0:
- try:
- result = self._aggregate(func, axis=axis, *args, **kwargs)
- except TypeError:
- pass
-
- if result is None:
- kwargs.pop('is_transform', None)
- return self.apply(func, axis=axis, args=args, **kwargs)
-
- return result
-
- def _aggregate(self, arg, *args, **kwargs):
- _axis = kwargs.pop('_axis', None)
- if _axis is None:
- _axis = getattr(self, 'axis', 0)
- kwargs.pop('_level', None)
-
- if isinstance(arg, compat.string_types):
- return self._string_function(arg, *args, **kwargs)
-
- # Dictionaries have complex behavior because they can be renamed here.
- elif isinstance(arg, dict):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
- elif is_list_like(arg):
- return self.apply(arg, axis=_axis, args=args, **kwargs)
- elif callable(arg):
- self._callable_function(arg, _axis, *args, **kwargs)
- else:
- # TODO Make pandas error
- raise ValueError("type {} is not callable".format(type(arg)))
-
- def _string_function(self, func, *args, **kwargs):
- assert isinstance(func, compat.string_types)
-
- f = getattr(self, func, None)
-
- if f is not None:
- if callable(f):
- return f(*args, **kwargs)
-
- assert len(args) == 0
- assert len([kwarg
- for kwarg in kwargs
- if kwarg not in ['axis', '_level']]) == 0
- return f
-
- f = getattr(np, func, None)
- if f is not None:
- raise NotImplementedError("Numpy aggregates not yet supported.")
-
- raise ValueError("{} is an unknown string function".format(func))
-
- def _callable_function(self, func, axis, *args, **kwargs):
- kwargs['axis'] = axis
-
- def agg_helper(df, arg, index, columns, *args, **kwargs):
- df.index = index
- df.columns = columns
- is_transform = kwargs.pop('is_transform', False)
- new_df = df.agg(arg, *args, **kwargs)
-
- is_series = False
- index = None
- columns = None
-
- if isinstance(new_df, pandas.Series):
- is_series = True
- else:
- columns = new_df.columns
- index = new_df.index
- new_df.columns = pandas.RangeIndex(0, len(new_df.columns))
- new_df.reset_index(drop=True, inplace=True)
-
- if is_transform:
- if is_scalar(new_df) or len(new_df) != len(df):
- raise ValueError("transforms cannot produce "
- "aggregated results")
-
- return is_series, new_df, index, columns
-
- if axis == 0:
- index = self.index
- columns = [self._col_metadata.partition_series(i).index
- for i in range(len(self._col_partitions))]
-
- remote_result = \
- [_deploy_func._submit(args=(
- lambda df: agg_helper(df,
- func,
- index,
- cols,
- *args,
- **kwargs),
- part), num_return_vals=4)
- for cols, part in zip(columns, self._col_partitions)]
-
- if axis == 1:
- indexes = [self._row_metadata.partition_series(i).index
- for i in range(len(self._row_partitions))]
- columns = self.columns
-
- remote_result = \
- [_deploy_func._submit(args=(
- lambda df: agg_helper(df,
- func,
- index,
- columns,
- *args,
- **kwargs),
- part), num_return_vals=4)
- for index, part in zip(indexes, self._row_partitions)]
-
- # This magic transposes the list comprehension returned from remote
- is_series, new_parts, index, columns = \
- [list(t) for t in zip(*remote_result)]
-
- # This part is because agg can allow returning a Series or a
- # DataFrame, and we have to determine which here. Shouldn't add
- # too much to latency in either case because the booleans can
- # be returned immediately
- is_series = ray.get(is_series)
- if all(is_series):
- new_series = pandas.concat(ray.get(new_parts), copy=False)
- new_series.index = self.columns if axis == 0 else self.index
- return new_series
- # This error is thrown when some of the partitions return Series and
- # others return DataFrames. We do not allow mixed returns.
- elif any(is_series):
- raise ValueError("no results.")
- # The remaining logic executes when we have only DataFrames in the
- # remote objects. We build a Ray DataFrame from the Pandas partitions.
- elif axis == 0:
- new_index = ray.get(index[0])
- # This does not handle the Multi Index case
- new_columns = ray.get(columns)
- new_columns = new_columns[0].append(new_columns[1:])
-
- return DataFrame(col_partitions=new_parts,
- columns=new_columns,
- index=new_index)
- else:
- new_columns = ray.get(columns[0])
- # This does not handle the Multi Index case
- new_index = ray.get(index)
- new_index = new_index[0].append(new_index[1:])
-
- return DataFrame(row_partitions=new_parts,
- columns=new_columns,
- index=new_index)
-
- def align(self, other, join='outer', axis=None, level=None, copy=True,
- fill_value=None, method=None, limit=None, fill_axis=0,
- broadcast_axis=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def all(self, axis=None, bool_only=None, skipna=None, level=None,
- **kwargs):
- """Return whether all elements are True over requested axis
-
- Note:
- If axis=None or axis=0, this call applies df.all(axis=1)
- to the transpose of df.
- """
- def remote_func(df):
- return df.all(axis=axis, bool_only=bool_only, skipna=skipna,
- level=level, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def any(self, axis=None, bool_only=None, skipna=None, level=None,
- **kwargs):
- """Return whether any elements are True over requested axis
-
- Note:
- If axis=None or axis=0, this call applies on the column partitions,
- otherwise operates on row partitions
- """
- def remote_func(df):
- return df.any(axis=axis, bool_only=bool_only, skipna=skipna,
- level=level, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def append(self, other, ignore_index=False, verify_integrity=False):
- """Append another DataFrame/list/Series to this one.
-
- Args:
- other: The object to append to this.
- ignore_index: Ignore the index on appending.
- verify_integrity: Verify the integrity of the index on completion.
-
- Returns:
- A new DataFrame containing the concatenated values.
- """
- if isinstance(other, (pandas.Series, dict)):
- if isinstance(other, dict):
- other = pandas.Series(other)
- if other.name is None and not ignore_index:
- raise TypeError('Can only append a Series if ignore_index=True'
- ' or if the Series has a name')
-
- if other.name is None:
- index = None
- else:
- # other must have the same index name as self, otherwise
- # index name will be reset
- index = pandas.Index([other.name], name=self.index.name)
-
- combined_columns = self.columns.tolist() + self.columns.union(
- other.index).difference(self.columns).tolist()
- other = other.reindex(combined_columns, copy=False)
- other = pandas.DataFrame(other.values.reshape((1, len(other))),
- index=index,
- columns=combined_columns)
- other = other._convert(datetime=True, timedelta=True)
- elif isinstance(other, list) and not isinstance(other[0], DataFrame):
- other = pandas.DataFrame(other)
- if (self.columns.get_indexer(other.columns) >= 0).all():
- other = other.loc[:, self.columns]
-
- from .concat import concat
- if isinstance(other, (list, tuple)):
- to_concat = [self] + other
- else:
- to_concat = [self, other]
-
- return concat(to_concat, ignore_index=ignore_index,
- verify_integrity=verify_integrity)
-
- def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
- args=(), **kwds):
- """Apply a function along input axis of DataFrame.
-
- Args:
- func: The function to apply
- axis: The axis over which to apply the func.
- broadcast: Whether or not to broadcast.
- raw: Whether or not to convert to a Series.
- reduce: Whether or not to try to apply reduction procedures.
-
- Returns:
- Series or DataFrame, depending on func.
- """
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- if isinstance(func, compat.string_types):
- if axis == 1:
- kwds['axis'] = axis
- return getattr(self, func)(*args, **kwds)
- elif isinstance(func, dict):
- if axis == 1:
- raise TypeError(
- "(\"'dict' object is not callable\", "
- "'occurred at index {0}'".format(self.index[0]))
- if len(self.columns) != len(set(self.columns)):
- warnings.warn(
- 'duplicate column names not supported with apply().',
- FutureWarning, stacklevel=2)
- has_list = list in map(type, func.values())
- part_ind_tuples = [(self._col_metadata[key], key) for key in func]
-
- if has_list:
- # if input dict has a list, the function to apply must wrap
- # single functions in lists as well to get the desired output
- # format
- result = [_deploy_func.remote(
- lambda df: df.iloc[:, ind].apply(
- func[key] if is_list_like(func[key])
- else [func[key]]),
- self._col_partitions[part])
- for (part, ind), key in part_ind_tuples]
- return pandas.concat(ray.get(result), axis=1, copy=False)
- else:
- result = [_deploy_func.remote(
- lambda df: df.iloc[:, ind].apply(func[key]),
- self._col_partitions[part])
- for (part, ind), key in part_ind_tuples]
- return pandas.Series(ray.get(result), index=func.keys())
-
- elif is_list_like(func):
- if axis == 1:
- raise TypeError(
- "(\"'list' object is not callable\", "
- "'occurred at index {0}'".format(self.index[0]))
- # TODO: some checking on functions that return Series or Dataframe
- new_cols = _map_partitions(lambda df: df.apply(func),
- self._col_partitions)
-
- # resolve function names for the DataFrame index
- new_index = [f_name if isinstance(f_name, compat.string_types)
- else f_name.__name__ for f_name in func]
- return DataFrame(col_partitions=new_cols,
- columns=self.columns,
- index=new_index,
- col_metadata=self._col_metadata)
- elif callable(func):
- return self._callable_function(func, axis=axis, *args, **kwds)
-
- def as_blocks(self, copy=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def as_matrix(self, columns=None):
- """Convert the frame to its Numpy-array representation.
-
- Args:
- columns: If None, return all columns, otherwise,
- returns specified columns.
-
- Returns:
- values: ndarray
- """
- # TODO this is very inefficient, also see __array__
- return to_pandas(self).as_matrix(columns)
-
- def asfreq(self, freq, method=None, how=None, normalize=False,
- fill_value=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def asof(self, where, subset=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def assign(self, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def astype(self, dtype, copy=True, errors='raise', **kwargs):
- if isinstance(dtype, dict):
- if (not set(dtype.keys()).issubset(set(self.columns)) and
- errors == 'raise'):
- raise KeyError(
- "Only a column name can be used for the key in"
- "a dtype mappings argument.")
- columns = list(dtype.keys())
- col_idx = [(self.columns.get_loc(columns[i]), columns[i])
- if columns[i] in self.columns
- else (columns[i], columns[i])
- for i in range(len(columns))]
- new_dict = {}
- for idx, key in col_idx:
- new_dict[idx] = dtype[key]
- new_rows = _map_partitions(lambda df, dt: df.astype(dtype=dt,
- copy=True,
- errors=errors,
- **kwargs),
- self._row_partitions, new_dict)
- if copy:
- return DataFrame(row_partitions=new_rows,
- columns=self.columns,
- index=self.index)
- self._row_partitions = new_rows
- else:
- new_blocks = [_map_partitions(lambda d: d.astype(dtype=dtype,
- copy=True,
- errors=errors,
- **kwargs),
- block)
- for block in self._block_partitions]
- if copy:
- return DataFrame(block_partitions=new_blocks,
- columns=self.columns,
- index=self.index)
- self._block_partitions = new_blocks
-
- def at_time(self, time, asof=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def between_time(self, start_time, end_time, include_start=True,
- include_end=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
- """Synonym for DataFrame.fillna(method='bfill')
- """
- new_df = self.fillna(method='bfill',
- axis=axis,
- limit=limit,
- downcast=downcast,
- inplace=inplace)
- if not inplace:
- return new_df
-
- def bool(self):
- """Return the bool of a single element PandasObject.
-
- This must be a boolean scalar value, either True or False. Raise a
- ValueError if the PandasObject does not have exactly 1 element, or that
- element is not boolean
- """
- shape = self.shape
- if shape != (1,) and shape != (1, 1):
- raise ValueError("""The PandasObject does not have exactly
- 1 element. Return the bool of a single
- element PandasObject. The truth value is
- ambiguous. Use a.empty, a.item(), a.any()
- or a.all().""")
- else:
- return to_pandas(self).bool()
-
- def boxplot(self, column=None, by=None, ax=None, fontsize=None, rot=0,
- grid=True, figsize=None, layout=None, return_type=None,
- **kwds):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def clip(self, lower=None, upper=None, axis=None, inplace=False, *args,
- **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def clip_lower(self, threshold, axis=None, inplace=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def clip_upper(self, threshold, axis=None, inplace=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def combine(self, other, func, fill_value=None, overwrite=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def combine_first(self, other):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def compound(self, axis=None, skipna=None, level=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def consolidate(self, inplace=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def convert_objects(self, convert_dates=True, convert_numeric=False,
- convert_timedeltas=True, copy=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def corr(self, method='pearson', min_periods=1):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def corrwith(self, other, axis=0, drop=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def count(self, axis=0, level=None, numeric_only=False):
- """Get the count of non-null objects in the DataFrame.
-
- Arguments:
- axis: 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- level: If the axis is a MultiIndex (hierarchical), count along a
- particular level, collapsing into a DataFrame.
- numeric_only: Include only float, int, boolean data
-
- Returns:
- The count, in a Series (or DataFrame if level is specified).
- """
- def remote_func(df):
- return df.count(axis=axis, level=level, numeric_only=numeric_only)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def cov(self, min_periods=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def _cumulative_helper(self, func, axis):
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
-
- if axis == 0:
- new_cols = _map_partitions(func, self._col_partitions)
- return DataFrame(col_partitions=new_cols,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata)
- else:
- new_rows = _map_partitions(func, self._row_partitions)
- return DataFrame(row_partitions=new_rows,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata)
-
- def cummax(self, axis=None, skipna=True, *args, **kwargs):
- """Perform a cumulative maximum across the DataFrame.
-
- Args:
- axis (int): The axis to take maximum on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The cumulative maximum of the DataFrame.
- """
- def remote_func(df):
- return df.cummax(axis=axis, skipna=skipna, *args, **kwargs)
-
- return self._cumulative_helper(remote_func, axis)
-
- def cummin(self, axis=None, skipna=True, *args, **kwargs):
- """Perform a cumulative minimum across the DataFrame.
-
- Args:
- axis (int): The axis to cummin on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The cumulative minimum of the DataFrame.
- """
- def remote_func(df):
- return df.cummin(axis=axis, skipna=skipna, *args, **kwargs)
-
- return self._cumulative_helper(remote_func, axis)
-
- def cumprod(self, axis=None, skipna=True, *args, **kwargs):
- """Perform a cumulative product across the DataFrame.
-
- Args:
- axis (int): The axis to take product on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The cumulative product of the DataFrame.
- """
- def remote_func(df):
- return df.cumprod(axis=axis, skipna=skipna, *args, **kwargs)
-
- return self._cumulative_helper(remote_func, axis)
-
- def cumsum(self, axis=None, skipna=True, *args, **kwargs):
- """Perform a cumulative sum across the DataFrame.
-
- Args:
- axis (int): The axis to take sum on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The cumulative sum of the DataFrame.
- """
- def remote_func(df):
- return df.cumsum(axis=axis, skipna=skipna, *args, **kwargs)
-
- return self._cumulative_helper(remote_func, axis)
-
- def describe(self, percentiles=None, include=None, exclude=None):
- """
- Generates descriptive statistics that summarize the central tendency,
- dispersion and shape of a dataset's distribution, excluding NaN values.
-
- Args:
- percentiles (list-like of numbers, optional):
- The percentiles to include in the output.
- include: White-list of data types to include in results
- exclude: Black-list of data types to exclude in results
-
- Returns: Series/DataFrame of summary statistics
- """
- def describe_helper(df):
- """This to ensure nothing goes on with non-numeric columns"""
- try:
- return df.select_dtypes(exclude='object').describe(
- percentiles=percentiles,
- include=include,
- exclude=exclude)
- # This exception is thrown when there are only non-numeric columns
- # in this partition
- except ValueError:
- return pandas.DataFrame()
-
- # Begin fixing index based on the columns inside.
- parts = ray.get(_map_partitions(describe_helper, self._col_partitions))
- # We use the index to get the internal index.
- parts = [(parts[i], i) for i in range(len(parts))]
-
- for df, partition in parts:
- this_partition = self._col_metadata.partition_series(partition)
- df.columns = this_partition[this_partition.isin(df.columns)].index
-
- # Remove index from tuple
- result = pandas.concat([obj[0] for obj in parts], axis=1, copy=False)
- return result
-
- def diff(self, periods=1, axis=0):
- """Finds the difference between elements on the axis requested
-
- Args:
- periods: Periods to shift for forming difference
- axis: Take difference over rows or columns
-
- Returns:
- DataFrame with the diff applied
- """
- axis = pandas.DataFrame()._get_axis_number(axis)
- partitions = (self._col_partitions if
- axis == 0 else self._row_partitions)
-
- result = _map_partitions(lambda df:
- df.diff(axis=axis, periods=periods),
- partitions)
-
- if (axis == 1):
- return DataFrame(row_partitions=result,
- columns=self.columns,
- index=self.index)
- if (axis == 0):
- return DataFrame(col_partitions=result,
- columns=self.columns,
- index=self.index)
-
- def div(self, other, axis='columns', level=None, fill_value=None):
- """Divides this DataFrame against another DataFrame/Series/scalar.
-
- Args:
- other: The object to use to apply the divide against this.
- axis: The axis to divide over.
- level: The Multilevel index level to apply divide over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Divide applied.
- """
- return self._operator_helper(pandas.DataFrame.div, other, axis, level,
- fill_value)
-
- def divide(self, other, axis='columns', level=None, fill_value=None):
- """Synonym for div.
-
- Args:
- other: The object to use to apply the divide against this.
- axis: The axis to divide over.
- level: The Multilevel index level to apply divide over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Divide applied.
- """
- return self.div(other, axis, level, fill_value)
-
- def dot(self, other):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
- inplace=False, errors='raise'):
- """Return new object with labels in requested axis removed.
- Args:
- labels: Index or column labels to drop.
- axis: Whether to drop labels from the index (0 / 'index') or
- columns (1 / 'columns').
- index, columns: Alternative to specifying axis (labels, axis=1 is
- equivalent to columns=labels).
- level: For MultiIndex
- inplace: If True, do operation inplace and return None.
- errors: If 'ignore', suppress error and existing labels are
- dropped.
- Returns:
- dropped : type of caller
- """
- # TODO implement level
- if level is not None:
- raise NotImplementedError("Level not yet supported for drop")
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- if labels is not None:
- if index is not None or columns is not None:
- raise ValueError("Cannot specify both 'labels' and "
- "'index'/'columns'")
- axis = pandas.DataFrame()._get_axis_name(axis)
- axes = {axis: labels}
- elif index is not None or columns is not None:
- axes, _ = pandas.DataFrame() \
- ._construct_axes_from_arguments((index, columns), {})
- else:
- raise ValueError("Need to specify at least one of 'labels', "
- "'index' or 'columns'")
- obj = self.copy()
-
- def drop_helper(obj, axis, label):
- # TODO(patyang): If you drop from the index first, you can do it
- # in batch by returning the dropped items. Likewise coords.drop
- # leaves the coords df in an inconsistent state.
- if axis == 'index':
- try:
- coords = obj._row_metadata[label]
- if isinstance(coords, pandas.DataFrame):
- partitions = list(coords['partition'])
- indexes = list(coords['index_within_partition'])
- else:
- partitions, indexes = coords
- partitions = [partitions]
- indexes = [indexes]
-
- for part, index in zip(partitions, indexes):
- x = _deploy_func.remote(
- lambda df: df.drop(labels=index, axis=axis,
- errors='ignore'),
- obj._row_partitions[part])
- obj._row_partitions = \
- [obj._row_partitions[i] if i != part
- else x
- for i in range(len(obj._row_partitions))]
-
- # The decrement here is because we're dropping one at a
- # time and the index is automatically updated when we
- # convert back to blocks.
- obj._row_metadata.squeeze(part, index)
-
- obj._row_metadata.drop(labels=label)
- except KeyError:
- return obj
- else:
- try:
- coords = obj._col_metadata[label]
- if isinstance(coords, pandas.DataFrame):
- partitions = list(coords['partition'])
- indexes = list(coords['index_within_partition'])
- else:
- partitions, indexes = coords
- partitions = [partitions]
- indexes = [indexes]
-
- for part, index in zip(partitions, indexes):
- x = _deploy_func.remote(
- lambda df: df.drop(labels=index, axis=axis,
- errors='ignore'),
- obj._col_partitions[part])
- obj._col_partitions = \
- [obj._col_partitions[i] if i != part
- else x
- for i in range(len(obj._col_partitions))]
-
- # The decrement here is because we're dropping one at a
- # time and the index is automatically updated when we
- # convert back to blocks.
- obj._col_metadata.squeeze(part, index)
-
- obj._col_metadata.drop(labels=label)
- except KeyError:
- return obj
-
- return obj
-
- for axis, labels in axes.items():
- if labels is None:
- continue
-
- if is_list_like(labels):
- for label in labels:
- if errors != 'ignore' and label and \
- label not in getattr(self, axis):
- raise ValueError("The label [{}] is not in the [{}]",
- label, axis)
- else:
- obj = drop_helper(obj, axis, label)
- else:
- if errors != 'ignore' and labels and \
- labels not in getattr(self, axis):
- raise ValueError("The label [{}] is not in the [{}]",
- labels, axis)
- else:
- obj = drop_helper(obj, axis, labels)
-
- if not inplace:
- return obj
- else:
- self._row_metadata = obj._row_metadata
- self._col_metadata = obj._col_metadata
- self._block_partitions = obj._block_partitions
-
- def drop_duplicates(self, subset=None, keep='first', inplace=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def duplicated(self, subset=None, keep='first'):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def eq(self, other, axis='columns', level=None):
- """Checks element-wise that this is equal to other.
-
- Args:
- other: A DataFrame or Series or scalar to compare to.
- axis: The axis to perform the eq over.
- level: The Multilevel index level to apply eq over.
-
- Returns:
- A new DataFrame filled with Booleans.
- """
- return self._operator_helper(pandas.DataFrame.eq, other, axis, level)
-
- def equals(self, other):
- """
- Checks if other DataFrame is elementwise equal to the current one
-
- Returns:
- Boolean: True if equal, otherwise False
- """
-
- if not self.index.equals(other.index) or not \
- self.columns.equals(other.columns):
- return False
-
- # We copartition because we don't know what the DataFrames look like
- # before this. Empty partitions can give problems with
- # _match_partitioning (See _match_partitioning)
- new_zipped_parts = self._copartition(other, self.index)
-
- equals_partitions = [_equals_helper.remote(left, right)
- for left, right in new_zipped_parts]
-
- # To avoid getting all we use next notation.
- return next((False for eq in equals_partitions if not ray.get(eq)),
- True)
-
- def eval(self, expr, inplace=False, **kwargs):
- """Evaluate a Python expression as a string using various backends.
- Args:
- expr: The expression to evaluate. This string cannot contain any
- Python statements, only Python expressions.
-
- parser: The parser to use to construct the syntax tree from the
- expression. The default of 'pandas' parses code slightly
- different than standard Python. Alternatively, you can parse
- an expression using the 'python' parser to retain strict
- Python semantics. See the enhancing performance documentation
- for more details.
-
- engine: The engine used to evaluate the expression.
-
- truediv: Whether to use true division, like in Python >= 3
-
- local_dict: A dictionary of local variables, taken from locals()
- by default.
-
- global_dict: A dictionary of global variables, taken from
- globals() by default.
-
- resolvers: A list of objects implementing the __getitem__ special
- method that you can use to inject an additional collection
- of namespaces to use for variable lookup. For example, this is
- used in the query() method to inject the index and columns
- variables that refer to their respective DataFrame instance
- attributes.
-
- level: The number of prior stack frames to traverse and add to
- the current scope. Most users will not need to change this
- parameter.
-
- target: This is the target object for assignment. It is used when
- there is variable assignment in the expression. If so, then
- target must support item assignment with string keys, and if a
- copy is being returned, it must also support .copy().
-
- inplace: If target is provided, and the expression mutates target,
- whether to modify target inplace. Otherwise, return a copy of
- target with the mutation.
- Returns:
- ndarray, numeric scalar, DataFrame, Series
- """
- self._validate_eval_query(expr, **kwargs)
-
- columns = self.columns
-
- def eval_helper(df):
- df.columns = columns
- result = df.eval(expr, inplace=False, **kwargs)
- # If result is a series, expr was not an assignment expression.
- if not isinstance(result, pandas.Series):
- result.columns = pandas.RangeIndex(0, len(result.columns))
- return result
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- new_rows = _map_partitions(eval_helper, self._row_partitions)
-
- result_type = ray.get(_deploy_func.remote(lambda df: type(df),
- new_rows[0]))
- if result_type is pandas.Series:
- new_series = pandas.concat(ray.get(new_rows), axis=0, copy=False)
- new_series.index = self.index
- return new_series
-
- columns_copy = self._col_metadata._coord_df.copy().T
- columns_copy.eval(expr, inplace=True, **kwargs)
- columns = columns_copy.columns
-
- if inplace:
- self._update_inplace(row_partitions=new_rows, columns=columns,
- index=self.index)
- else:
- return DataFrame(columns=columns, row_partitions=new_rows)
-
- def ewm(self, com=None, span=None, halflife=None, alpha=None,
- min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def expanding(self, min_periods=1, freq=None, center=False, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
- """Synonym for DataFrame.fillna(method='ffill')
- """
- new_df = self.fillna(method='ffill',
- axis=axis,
- limit=limit,
- downcast=downcast,
- inplace=inplace)
- if not inplace:
- return new_df
-
- def fillna(self, value=None, method=None, axis=None, inplace=False,
- limit=None, downcast=None, **kwargs):
- """Fill NA/NaN values using the specified method.
-
- Args:
- value: Value to use to fill holes. This value cannot be a list.
-
- method: Method to use for filling holes in reindexed Series pad.
- ffill: propagate last valid observation forward to next valid
- backfill.
- bfill: use NEXT valid observation to fill gap.
-
- axis: 0 or 'index', 1 or 'columns'.
-
- inplace: If True, fill in place. Note: this will modify any other
- views on this object.
-
- limit: If method is specified, this is the maximum number of
- consecutive NaN values to forward/backward fill. In other
- words, if there is a gap with more than this number of
- consecutive NaNs, it will only be partially filled. If method
- is not specified, this is the maximum number of entries along
- the entire axis where NaNs will be filled. Must be greater
- than 0 if not None.
-
- downcast: A dict of item->dtype of what to downcast if possible,
- or the string 'infer' which will try to downcast to an
- appropriate equal type.
-
- Returns:
- filled: DataFrame
- """
- # TODO implement value passed as DataFrame
- if isinstance(value, pandas.DataFrame):
- raise NotImplementedError("Passing a DataFrame as the value for "
- "fillna is not yet supported.")
-
- inplace = validate_bool_kwarg(inplace, 'inplace')
-
- axis = pandas.DataFrame()._get_axis_number(axis) \
- if axis is not None \
- else 0
-
- if isinstance(value, (list, tuple)):
- raise TypeError('"value" parameter must be a scalar or dict, but '
- 'you passed a "{0}"'.format(type(value).__name__))
- if value is None and method is None:
- raise ValueError('must specify a fill method or value')
- if value is not None and method is not None:
- raise ValueError('cannot specify both a fill method and value')
- if method is not None and method not in ['backfill', 'bfill', 'pad',
- 'ffill']:
- expecting = 'pad (ffill) or backfill (bfill)'
- msg = 'Invalid fill method. Expecting {expecting}. Got {method}'\
- .format(expecting=expecting, method=method)
- raise ValueError(msg)
-
- if inplace:
- new_obj = self
- else:
- new_obj = self.copy()
-
- parts, coords_obj = (new_obj._col_partitions,
- new_obj._col_metadata) if axis == 0 else \
- (new_obj._row_partitions,
- new_obj._row_metadata)
-
- if isinstance(value, (pandas.Series, dict)):
- new_vals = {}
- value = dict(value)
- for val in value:
- # Get the local index for the partition
- try:
- part, index = coords_obj[val]
- # Pandas ignores these errors so we will suppress them too.
- except KeyError:
- continue
-
- new_vals[val] = _deploy_func.remote(lambda df: df.fillna(
- value={index: value[val]},
- method=method,
- axis=axis,
- inplace=False,
- limit=limit,
- downcast=downcast,
- **kwargs), parts[part])
-
- # Not every partition was changed, so we put everything back that
- # was not changed and update those that were.
- new_parts = [parts[i] if coords_obj.index[i] not in new_vals
- else new_vals[coords_obj.index[i]]
- for i in range(len(parts))]
- else:
- new_parts = _map_partitions(lambda df: df.fillna(
- value=value,
- method=method,
- axis=axis,
- inplace=False,
- limit=limit,
- downcast=downcast,
- **kwargs), parts)
-
- if axis == 0:
- new_obj._update_inplace(col_partitions=new_parts,
- columns=self.columns,
- index=self.index)
- else:
- new_obj._update_inplace(row_partitions=new_parts,
- columns=self.columns,
- index=self.index)
- if not inplace:
- return new_obj
-
- def filter(self, items=None, like=None, regex=None, axis=None):
- """Subset rows or columns based on their labels
-
- Args:
- items (list): list of labels to subset
- like (string): retain labels where `arg in label == True`
- regex (string): retain labels matching regex input
- axis: axis to filter on
-
- Returns:
- A new dataframe with the filter applied.
- """
- nkw = com._count_not_none(items, like, regex)
- if nkw > 1:
- raise TypeError('Keyword arguments `items`, `like`, or `regex` '
- 'are mutually exclusive')
- if nkw == 0:
- raise TypeError('Must pass either `items`, `like`, or `regex`')
-
- if axis is None:
- axis = 'columns' # This is the default info axis for dataframes
-
- axis = pandas.DataFrame()._get_axis_number(axis)
- labels = self.columns if axis else self.index
-
- if items is not None:
- bool_arr = labels.isin(items)
- elif like is not None:
- def f(x):
- return like in to_str(x)
- bool_arr = labels.map(f).tolist()
- else:
- def f(x):
- return matcher.search(to_str(x)) is not None
- matcher = re.compile(regex)
- bool_arr = labels.map(f).tolist()
-
- if not axis:
- return self[bool_arr]
- return self[self.columns[bool_arr]]
-
- def first(self, offset):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def first_valid_index(self):
- """Return index for first non-NA/null value.
-
- Returns:
- scalar: type of index
- """
- return self._row_metadata.first_valid_index()
-
- def floordiv(self, other, axis='columns', level=None, fill_value=None):
- """Divides this DataFrame against another DataFrame/Series/scalar.
-
- Args:
- other: The object to use to apply the divide against this.
- axis: The axis to divide over.
- level: The Multilevel index level to apply divide over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Divide applied.
- """
- return self._operator_helper(pandas.DataFrame.floordiv, other, axis,
- level, fill_value)
-
- @classmethod
- def from_csv(self, path, header=0, sep=', ', index_col=0,
- parse_dates=True, encoding=None, tupleize_cols=None,
- infer_datetime_format=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @classmethod
- def from_dict(self, data, orient='columns', dtype=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @classmethod
- def from_items(self, items, columns=None, orient='columns'):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @classmethod
- def from_records(self, data, index=None, exclude=None, columns=None,
- coerce_float=False, nrows=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def ge(self, other, axis='columns', level=None):
- """Checks element-wise that this is greater than or equal to other.
-
- Args:
- other: A DataFrame or Series or scalar to compare to.
- axis: The axis to perform the gt over.
- level: The Multilevel index level to apply gt over.
-
- Returns:
- A new DataFrame filled with Booleans.
- """
- return self._operator_helper(pandas.DataFrame.ge, other, axis, level)
-
- def get(self, key, default=None):
- """Get item from object for given key (DataFrame column, Panel
- slice, etc.). Returns default value if not found.
-
- Args:
- key (DataFrame column, Panel slice) : the key for which value
- to get
-
- Returns:
- value (type of items contained in object) : A value that is
- stored at the key
- """
- try:
- return self[key]
- except (KeyError, ValueError, IndexError):
- return default
-
- def get_dtype_counts(self):
- """Get the counts of dtypes in this object.
-
- Returns:
- The counts of dtypes in this object.
- """
- return ray.get(_deploy_func.remote(lambda df: df.get_dtype_counts(),
- self._row_partitions[0]))
-
- def get_ftype_counts(self):
- """Get the counts of ftypes in this object.
-
- Returns:
- The counts of ftypes in this object.
- """
- return ray.get(_deploy_func.remote(lambda df: df.get_ftype_counts(),
- self._row_partitions[0]))
-
- def get_value(self, index, col, takeable=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def get_values(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def gt(self, other, axis='columns', level=None):
- """Checks element-wise that this is greater than other.
-
- Args:
- other: A DataFrame or Series or scalar to compare to.
- axis: The axis to perform the gt over.
- level: The Multilevel index level to apply gt over.
-
- Returns:
- A new DataFrame filled with Booleans.
- """
- return self._operator_helper(pandas.DataFrame.gt, other, axis, level)
-
- def head(self, n=5):
- """Get the first n rows of the dataframe.
-
- Args:
- n (int): The number of rows to return.
-
- Returns:
- A new dataframe with the first n rows of the dataframe.
- """
- if n >= len(self._row_metadata):
- return self.copy()
-
- new_dfs = _map_partitions(lambda df: df.head(n),
- self._col_partitions)
-
- index = self._row_metadata.index[:n]
-
- return DataFrame(col_partitions=new_dfs,
- col_metadata=self._col_metadata,
- index=index,
- dtypes_cache=self._dtypes_cache)
-
- def hist(self, data, column=None, by=None, grid=True, xlabelsize=None,
- xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
- sharey=False, figsize=None, layout=None, bins=10, **kwds):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def idxmax(self, axis=0, skipna=True):
- """Get the index of the first occurrence of the max value of the axis.
-
- Args:
- axis (int): Identify the max over the rows (1) or columns (0).
- skipna (bool): Whether or not to skip NA values.
-
- Returns:
- A Series with the index for each maximum value for the axis
- specified.
- """
- if not all(d != np.dtype('O') for d in self.dtypes):
- raise TypeError(
- "reduction operation 'argmax' not allowed for this dtype")
-
- def remote_func(df):
- return df.idxmax(axis=axis, skipna=skipna)
-
- internal_indices = self._arithmetic_helper(remote_func, axis)
- # do this to convert internal indices to correct index
- return internal_indices.apply(lambda x: self.index[x])
-
- def idxmin(self, axis=0, skipna=True):
- """Get the index of the first occurrence of the min value of the axis.
-
- Args:
- axis (int): Identify the min over the rows (1) or columns (0).
- skipna (bool): Whether or not to skip NA values.
-
- Returns:
- A Series with the index for each minimum value for the axis
- specified.
- """
- if not all(d != np.dtype('O') for d in self.dtypes):
- raise TypeError(
- "reduction operation 'argmax' not allowed for this dtype")
-
- def remote_func(df):
- return df.idxmin(axis=axis, skipna=skipna)
-
- internal_indices = self._arithmetic_helper(remote_func, axis)
- # do this to convert internal indices to correct index
- return internal_indices.apply(lambda x: self.index[x])
-
- def infer_objects(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
- null_counts=None):
-
- def info_helper(df):
- output_buffer = io.StringIO()
- df.info(verbose=verbose,
- buf=output_buffer,
- max_cols=max_cols,
- memory_usage=memory_usage,
- null_counts=null_counts)
- return output_buffer.getvalue()
-
- # Combine the per-partition info and split into lines
- result = ''.join(ray.get(_map_partitions(info_helper,
- self._col_partitions)))
- lines = result.split('\n')
-
- # Class denoted in info() output
- class_string = '\n'
-
- # Create the Index info() string by parsing self.index
- index_string = self.index.summary() + '\n'
-
- # A column header is needed in the inf() output
- col_header = 'Data columns (total {0} columns):\n' \
- .format(len(self.columns))
-
- # Parse the per-partition values to get the per-column details
- # Find all the lines in the output that start with integers
- prog = re.compile('^[0-9]+.+')
- col_lines = [prog.match(line) for line in lines]
- cols = [c.group(0) for c in col_lines if c is not None]
- # replace the partition columns names with real column names
- columns = ["{0}\t{1}\n".format(self.columns[i],
- cols[i].split(" ", 1)[1])
- for i in range(len(cols))]
- col_string = ''.join(columns) + '\n'
-
- # A summary of the dtypes in the dataframe
- dtypes_string = "dtypes: "
- for dtype, count in self.dtypes.value_counts().iteritems():
- dtypes_string += "{0}({1}),".format(dtype, count)
- dtypes_string = dtypes_string[:-1] + '\n'
-
- # Compute the memory usage by summing per-partitions return values
- # Parse lines for memory usage number
- prog = re.compile('^memory+.+')
- mems = [prog.match(line) for line in lines]
- mem_vals = [float(re.search(r'\d+', m.group(0)).group())
- for m in mems if m is not None]
-
- memory_string = ""
-
- if len(mem_vals) != 0:
- # Sum memory usage from each partition
- if memory_usage != 'deep':
- memory_string = 'memory usage: {0}+ bytes' \
- .format(sum(mem_vals))
- else:
- memory_string = 'memory usage: {0} bytes'.format(sum(mem_vals))
-
- # Combine all the components of the info() output
- result = ''.join([class_string, index_string, col_header,
- col_string, dtypes_string, memory_string])
-
- # Write to specified output buffer
- if buf:
- buf.write(result)
- else:
- sys.stdout.write(result)
-
- def insert(self, loc, column, value, allow_duplicates=False):
- """Insert column into DataFrame at specified location.
-
- Args:
- loc (int): Insertion index. Must verify 0 <= loc <= len(columns).
- column (hashable object): Label of the inserted column.
- value (int, Series, or array-like): The values to insert.
- allow_duplicates (bool): Whether to allow duplicate column names.
- """
- if not is_list_like(value):
- value = np.full(len(self.index), value)
-
- if len(value) != len(self.index):
- raise ValueError(
- "Length of values does not match length of index")
- if not allow_duplicates and column in self.columns:
- raise ValueError(
- "cannot insert {0}, already exists".format(column))
- if loc > len(self.columns):
- raise IndexError(
- "index {0} is out of bounds for axis 0 with size {1}".format(
- loc, len(self.columns)))
- if loc < 0:
- raise ValueError("unbounded slice")
-
- partition, index_within_partition = \
- self._col_metadata.insert(column, loc)
-
- # Deploy insert function to specific column partition, and replace that
- # column
- def insert_col_part(df):
- if isinstance(value, pandas.Series) and \
- isinstance(value.dtype,
- pandas.core.dtypes.dtypes.DatetimeTZDtype):
- # Need to set index to index of this dtype or inserted values
- # become NaT
- df.index = value
- df.insert(index_within_partition, column,
- value, allow_duplicates)
- df.index = pandas.RangeIndex(0, len(df))
- else:
- df.insert(index_within_partition, column,
- value, allow_duplicates)
- return df
-
- new_obj = _deploy_func.remote(insert_col_part,
- self._col_partitions[partition])
-
- new_cols = [self._col_partitions[i]
- if i != partition
- else new_obj
- for i in range(len(self._col_partitions))]
- new_col_names = self.columns.insert(loc, column)
-
- self._update_inplace(col_partitions=new_cols, columns=new_col_names,
- index=self.index)
-
- def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
- limit_direction='forward', downcast=None, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def iterrows(self):
- """Iterate over DataFrame rows as (index, Series) pairs.
-
- Note:
- Generators can't be pickled so from the remote function
- we expand the generator into a list before getting it.
- This is not that ideal.
-
- Returns:
- A generator that iterates over the rows of the frame.
- """
- index_iter = (self._row_metadata.partition_series(i).index
- for i in range(len(self._row_partitions)))
-
- def iterrow_helper(part):
- df = ray.get(part)
- df.columns = self.columns
- df.index = next(index_iter)
- return df.iterrows()
-
- partition_iterator = PartitionIterator(self._row_partitions,
- iterrow_helper)
-
- for v in partition_iterator:
- yield v
-
- def items(self):
- """Iterator over (column name, Series) pairs.
-
- Note:
- Generators can't be pickled so from the remote function
- we expand the generator into a list before getting it.
- This is not that ideal.
-
- Returns:
- A generator that iterates over the columns of the frame.
- """
- col_iter = (self._col_metadata.partition_series(i).index
- for i in range(len(self._col_partitions)))
-
- def items_helper(part):
- df = ray.get(part)
- df.columns = next(col_iter)
- df.index = self.index
- return df.items()
-
- partition_iterator = PartitionIterator(self._col_partitions,
- items_helper)
-
- for v in partition_iterator:
- yield v
-
- def iteritems(self):
- """Iterator over (column name, Series) pairs.
-
- Note:
- Returns the same thing as .items()
-
- Returns:
- A generator that iterates over the columns of the frame.
- """
- return self.items()
-
- def itertuples(self, index=True, name='Pandas'):
- """Iterate over DataFrame rows as namedtuples.
-
- Args:
- index (boolean, default True): If True, return the index as the
- first element of the tuple.
- name (string, default "Pandas"): The name of the returned
- namedtuples or None to return regular tuples.
- Note:
- Generators can't be pickled so from the remote function
- we expand the generator into a list before getting it.
- This is not that ideal.
-
- Returns:
- A tuple representing row data. See args for varying tuples.
- """
- index_iter = (self._row_metadata.partition_series(i).index
- for i in range(len(self._row_partitions)))
-
- def itertuples_helper(part):
- df = ray.get(part)
- df.columns = self.columns
- df.index = next(index_iter)
- return df.itertuples(index=index, name=name)
-
- partition_iterator = PartitionIterator(self._row_partitions,
- itertuples_helper)
-
- for v in partition_iterator:
- yield v
-
- def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
- sort=False):
- """Join two or more DataFrames, or a DataFrame with a collection.
-
- Args:
- other: What to join this DataFrame with.
- on: A column name to use from the left for the join.
- how: What type of join to conduct.
- lsuffix: The suffix to add to column names that match on left.
- rsuffix: The suffix to add to column names that match on right.
- sort: Whether or not to sort.
-
- Returns:
- The joined DataFrame.
- """
-
- if on is not None:
- raise NotImplementedError("Not yet.")
-
- if isinstance(other, pandas.Series):
- if other.name is None:
- raise ValueError("Other Series must have a name")
- other = DataFrame({other.name: other})
-
- if isinstance(other, DataFrame):
- if on is not None:
- index = self[on]
- else:
- index = self.index
-
- new_index = index.join(other.index, how=how, sort=sort)
-
- # Joining two empty DataFrames is fast, and error checks for us.
- new_column_labels = pandas.DataFrame(columns=self.columns) \
- .join(pandas.DataFrame(columns=other.columns),
- lsuffix=lsuffix, rsuffix=rsuffix).columns
-
- new_partition_num = max(len(self._block_partitions.T),
- len(other._block_partitions.T))
-
- # Join is a concat once we have shuffled the data internally.
- # We shuffle the data by computing the correct order.
- # Another important thing to note: We set the current self index
- # to the index variable which may be 'on'.
- new_self = np.array([
- _reindex_helper._submit(args=tuple([index, new_index, 1,
- new_partition_num] +
- block.tolist()),
- num_return_vals=new_partition_num)
- for block in self._block_partitions.T])
- new_other = np.array([
- _reindex_helper._submit(args=tuple([other.index, new_index, 1,
- new_partition_num] +
- block.tolist()),
- num_return_vals=new_partition_num)
- for block in other._block_partitions.T])
-
- # Append the blocks together (i.e. concat)
- new_block_parts = np.concatenate((new_self, new_other)).T
-
- # Default index in the case that on is set.
- if on is not None:
- new_index = None
-
- # TODO join the two metadata tables for performance.
- return DataFrame(block_partitions=new_block_parts,
- index=new_index,
- columns=new_column_labels)
- else:
- # This constraint carried over from Pandas.
- if on is not None:
- raise ValueError("Joining multiple DataFrames only supported"
- " for joining on index")
-
- # Joining the empty DataFrames with either index or columns is
- # fast. It gives us proper error checking for the edge cases that
- # would otherwise require a lot more logic.
- new_index = pandas.DataFrame(index=self.index).join(
- [pandas.DataFrame(index=obj.index) for obj in other],
- how=how, sort=sort).index
-
- new_column_labels = pandas.DataFrame(columns=self.columns).join(
- [pandas.DataFrame(columns=obj.columns) for obj in other],
- lsuffix=lsuffix, rsuffix=rsuffix).columns
-
- new_partition_num = max([len(self._block_partitions.T)] +
- [len(obj._block_partitions.T)
- for obj in other])
-
- new_self = np.array([
- _reindex_helper._submit(args=tuple([self.index, new_index, 1,
- new_partition_num] +
- block.tolist()),
- num_return_vals=new_partition_num)
- for block in self._block_partitions.T])
-
- new_others = np.array([_reindex_helper._submit(
- args=tuple([obj.index, new_index, 1, new_partition_num] +
- block.tolist()),
- num_return_vals=new_partition_num
- ) for obj in other for block in obj._block_partitions.T])
-
- # Append the columns together (i.e. concat)
- new_block_parts = np.concatenate((new_self, new_others)).T
-
- # TODO join the two metadata tables for performance.
- return DataFrame(block_partitions=new_block_parts,
- index=new_index,
- columns=new_column_labels)
-
- def kurt(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def last(self, offset):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def last_valid_index(self):
- """Return index for last non-NA/null value.
-
- Returns:
- scalar: type of index
- """
- return self._row_metadata.last_valid_index()
-
- def le(self, other, axis='columns', level=None):
- """Checks element-wise that this is less than or equal to other.
-
- Args:
- other: A DataFrame or Series or scalar to compare to.
- axis: The axis to perform the le over.
- level: The Multilevel index level to apply le over.
-
- Returns:
- A new DataFrame filled with Booleans.
- """
- return self._operator_helper(pandas.DataFrame.le, other, axis, level)
-
- def lookup(self, row_labels, col_labels):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def lt(self, other, axis='columns', level=None):
- """Checks element-wise that this is less than other.
-
- Args:
- other: A DataFrame or Series or scalar to compare to.
- axis: The axis to perform the lt over.
- level: The Multilevel index level to apply lt over.
-
- Returns:
- A new DataFrame filled with Booleans.
- """
- return self._operator_helper(pandas.DataFrame.lt, other, axis, level)
-
- def mad(self, axis=None, skipna=None, level=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
- errors='raise', try_cast=False, raise_on_error=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def max(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- """Perform max across the DataFrame.
-
- Args:
- axis (int): The axis to take the max on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The max of the DataFrame.
- """
- def remote_func(df):
- return df.max(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- """Computes mean across the DataFrame.
-
- Args:
- axis (int): The axis to take the mean on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The mean of the DataFrame. (Pandas series)
- """
- def remote_func(df):
- return df.mean(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def median(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- """Computes median across the DataFrame.
-
- Args:
- axis (int): The axis to take the median on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The median of the DataFrame. (Pandas series)
- """
- def remote_func(df):
- return df.median(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def melt(self, id_vars=None, value_vars=None, var_name=None,
- value_name='value', col_level=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def memory_usage(self, index=True, deep=False):
-
- def remote_func(df):
- return df.memory_usage(index=False, deep=deep)
-
- result = self._arithmetic_helper(remote_func, axis=0)
-
- result.index = self.columns
- if index:
- index_value = self._row_metadata.index.memory_usage(deep=deep)
- return pandas.Series(index_value, index=['Index']).append(result)
-
- return result
-
- def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
- left_index=False, right_index=False, sort=False,
- suffixes=('_x', '_y'), copy=True, indicator=False,
- validate=None):
- """Database style join, where common columns in "on" are merged.
-
- Args:
- right: The DataFrame to merge against.
- how: What type of join to use.
- on: The common column name(s) to join on. If None, and left_on and
- right_on are also None, will default to all commonly named
- columns.
- left_on: The column(s) on the left to use for the join.
- right_on: The column(s) on the right to use for the join.
- left_index: Use the index from the left as the join keys.
- right_index: Use the index from the right as the join keys.
- sort: Sort the join keys lexicographically in the result.
- suffixes: Add this suffix to the common names not in the "on".
- copy: Does nothing in our implementation
- indicator: Adds a column named _merge to the DataFrame with
- metadata from the merge about each row.
- validate: Checks if merge is a specific type.
-
- Returns:
- A merged Dataframe
- """
-
- if not isinstance(right, DataFrame):
- raise ValueError("can not merge DataFrame with instance of type "
- "{}".format(type(right)))
-
- args = (how, on, left_on, right_on, left_index, right_index, sort,
- suffixes, False, indicator, validate)
-
- left_cols = ray.put(self.columns)
- right_cols = ray.put(right.columns)
-
- # This can be put in a remote function because we don't need it until
- # the end, and the columns can be built asynchronously. This takes the
- # columns defining off the critical path and speeds up the overall
- # merge.
- new_columns = _merge_columns.remote(left_cols, right_cols, *args)
-
- if on is not None:
- if left_on is not None or right_on is not None:
- raise MergeError("Can only pass argument \"on\" OR \"left_on\""
- " and \"right_on\", not a combination of "
- "both.")
- if not is_list_like(on):
- on = [on]
-
- if next((True for key in on if key not in self), False) or \
- next((True for key in on if key not in right), False):
-
- missing_key = \
- next((str(key) for key in on if key not in self), "") + \
- next((str(key) for key in on if key not in right), "")
- raise KeyError(missing_key)
-
- elif right_on is not None or right_index is True:
- if left_on is None and left_index is False:
- # Note: This is not the same error as pandas, but pandas throws
- # a ValueError NoneType has no len(), and I don't think that
- # helps enough.
- raise TypeError("left_on must be specified or left_index must "
- "be true if right_on is specified.")
-
- elif left_on is not None or left_index is True:
- if right_on is None and right_index is False:
- # Note: See note above about TypeError.
- raise TypeError("right_on must be specified or right_index "
- "must be true if right_on is specified.")
-
- if left_on is not None:
- if not is_list_like(left_on):
- left_on = [left_on]
-
- if next((True for key in left_on if key not in self), False):
- raise KeyError(next(key for key in left_on
- if key not in self))
-
- if right_on is not None:
- if not is_list_like(right_on):
- right_on = [right_on]
-
- if next((True for key in right_on if key not in right), False):
- raise KeyError(next(key for key in right_on
- if key not in right))
-
- # There's a small chance that our partitions are already perfect, but
- # if it's not, we need to adjust them. We adjust the right against the
- # left because the defaults of merge rely on the order of the left. We
- # have to push the index down here, so if we're joining on the right's
- # index we go ahead and push it down here too.
- if not np.array_equal(self._row_metadata._lengths,
- right._row_metadata._lengths) or right_index:
-
- repartitioned_right = np.array([_match_partitioning._submit(
- args=(df, self._row_metadata._lengths, right.index),
- num_return_vals=len(self._row_metadata._lengths))
- for df in right._col_partitions]).T
- else:
- repartitioned_right = right._block_partitions
-
- if not left_index and not right_index:
- # Passing None to each call specifies that we don't care about the
- # left's index for the join.
- left_idx = itertools.repeat(None)
-
- # We only return the index if we need to update it, and that only
- # happens when either left_index or right_index is True. We will
- # use this value to add the return vals if we are getting an index
- # back.
- return_index = False
- else:
- # We build this to push the index down so that we can use it for
- # the join.
- left_idx = \
- (v.index for k, v in
- self._row_metadata._coord_df.copy().groupby('partition'))
- return_index = True
-
- new_blocks = \
- np.array([_co_op_helper._submit(
- args=tuple([lambda x, y: x.merge(y, *args),
- left_cols, right_cols,
- len(self._block_partitions.T), next(left_idx)] +
- np.concatenate(obj).tolist()),
- num_return_vals=len(self._block_partitions.T) + return_index)
- for obj in zip(self._block_partitions,
- repartitioned_right)])
-
- if not return_index:
- # Default to RangeIndex if left_index and right_index both false.
- new_index = None
- else:
- new_index_parts = new_blocks[:, -1]
- new_index = _concat_index.remote(*new_index_parts)
- new_blocks = new_blocks[:, :-1]
-
- return DataFrame(block_partitions=new_blocks,
- columns=new_columns,
- index=new_index)
-
- def min(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- """Perform min across the DataFrame.
-
- Args:
- axis (int): The axis to take the min on.
- skipna (bool): True to skip NA values, false otherwise.
-
- Returns:
- The min of the DataFrame.
- """
- def remote_func(df):
- return df.min(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def mod(self, other, axis='columns', level=None, fill_value=None):
- """Mods this DataFrame against another DataFrame/Series/scalar.
-
- Args:
- other: The object to use to apply the mod against this.
- axis: The axis to mod over.
- level: The Multilevel index level to apply mod over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Mod applied.
- """
- return self._operator_helper(pandas.DataFrame.mod, other, axis, level,
- fill_value)
-
- def mode(self, axis=0, numeric_only=False):
- """Perform mode across the DataFrame.
-
- Args:
- axis (int): The axis to take the mode on.
- numeric_only (bool): if True, only apply to numeric columns.
-
- Returns:
- DataFrame: The mode of the DataFrame.
- """
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- def mode_helper(df):
- mode_df = df.mode(axis=axis, numeric_only=numeric_only)
- return mode_df, mode_df.shape[axis]
-
- def fix_length(df, *lengths):
- max_len = max(lengths[0])
- df = df.reindex(pandas.RangeIndex(max_len), axis=axis)
- return df
-
- parts = self._col_partitions if axis == 0 else self._row_partitions
-
- result = [_deploy_func._submit(args=(lambda df: mode_helper(df),
- part), num_return_vals=2)
- for part in parts]
-
- parts, lengths = [list(t) for t in zip(*result)]
-
- parts = [_deploy_func.remote(
- lambda df, *l: fix_length(df, l), part, *lengths)
- for part in parts]
-
- if axis == 0:
- return DataFrame(col_partitions=parts,
- columns=self.columns)
- else:
- return DataFrame(row_partitions=parts,
- index=self.index)
-
- def mul(self, other, axis='columns', level=None, fill_value=None):
- """Multiplies this DataFrame against another DataFrame/Series/scalar.
-
- Args:
- other: The object to use to apply the multiply against this.
- axis: The axis to multiply over.
- level: The Multilevel index level to apply multiply over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Multiply applied.
- """
- return self._operator_helper(pandas.DataFrame.mul, other, axis, level,
- fill_value)
-
- def multiply(self, other, axis='columns', level=None, fill_value=None):
- """Synonym for mul.
-
- Args:
- other: The object to use to apply the multiply against this.
- axis: The axis to multiply over.
- level: The Multilevel index level to apply multiply over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Multiply applied.
- """
- return self.mul(other, axis, level, fill_value)
-
- def ne(self, other, axis='columns', level=None):
- """Checks element-wise that this is not equal to other.
-
- Args:
- other: A DataFrame or Series or scalar to compare to.
- axis: The axis to perform the ne over.
- level: The Multilevel index level to apply ne over.
-
- Returns:
- A new DataFrame filled with Booleans.
- """
- return self._operator_helper(pandas.DataFrame.ne, other, axis, level)
-
- def nlargest(self, n, columns, keep='first'):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def notna(self):
- """Perform notna across the DataFrame.
-
- Args:
- None
-
- Returns:
- Boolean DataFrame where value is False if corresponding
- value is NaN, True otherwise
- """
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.notna(), block) for block in self._block_partitions])
-
- new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns),
- index=self.columns)
-
- return DataFrame(block_partitions=new_block_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata,
- dtypes_cache=new_dtypes)
-
- def notnull(self):
- """Perform notnull across the DataFrame.
-
- Args:
- None
-
- Returns:
- Boolean DataFrame where value is False if corresponding
- value is NaN, True otherwise
- """
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.notnull(), block)
- for block in self._block_partitions])
-
- new_dtypes = pandas.Series([np.dtype("bool")] * len(self.columns),
- index=self.columns)
-
- return DataFrame(block_partitions=new_block_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata,
- dtypes_cache=new_dtypes)
-
- def nsmallest(self, n, columns, keep='first'):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def nunique(self, axis=0, dropna=True):
- """Return Series with number of distinct
- observations over requested axis.
-
- Args:
- axis : {0 or 'index', 1 or 'columns'}, default 0
- dropna : boolean, default True
-
- Returns:
- nunique : Series
- """
- def remote_func(df):
- return df.nunique(axis=axis, dropna=dropna)
-
- return self._arithmetic_helper(remote_func, axis)
-
- def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
- **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def pipe(self, func, *args, **kwargs):
- """Apply func(self, *args, **kwargs)
-
- Args:
- func: function to apply to the df.
- args: positional arguments passed into ``func``.
- kwargs: a dictionary of keyword arguments passed into ``func``.
-
- Returns:
- object: the return type of ``func``.
- """
- return com._pipe(self, func, *args, **kwargs)
-
- def pivot(self, index=None, columns=None, values=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def pivot_table(self, values=None, index=None, columns=None,
- aggfunc='mean', fill_value=None, margins=False,
- dropna=True, margins_name='All'):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def plot(self, x=None, y=None, kind='line', ax=None, subplots=False,
- sharex=None, sharey=False, layout=None, figsize=None,
- use_index=True, title=None, grid=None, legend=True, style=None,
- logx=False, logy=False, loglog=False, xticks=None, yticks=None,
- xlim=None, ylim=None, rot=None, fontsize=None, colormap=None,
- table=False, yerr=None, xerr=None, secondary_y=False,
- sort_columns=False, **kwds):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def pop(self, item):
- """Pops an item from this DataFrame and returns it.
-
- Args:
- item (str): Column label to be popped
-
- Returns:
- A Series containing the popped values. Also modifies this
- DataFrame.
- """
- result = self[item]
- del self[item]
- return result
-
- def pow(self, other, axis='columns', level=None, fill_value=None):
- """Pow this DataFrame against another DataFrame/Series/scalar.
-
- Args:
- other: The object to use to apply the pow against this.
- axis: The axis to pow over.
- level: The Multilevel index level to apply pow over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Pow applied.
- """
- return self._operator_helper(pandas.DataFrame.pow, other, axis, level,
- fill_value)
-
- def prod(self, axis=None, skipna=None, level=None, numeric_only=None,
- min_count=1, **kwargs):
- """Return the product of the values for the requested axis
-
- Args:
- axis : {index (0), columns (1)}
- skipna : boolean, default True
- level : int or level name, default None
- numeric_only : boolean, default None
- min_count : int, default 1
-
- Returns:
- prod : Series or DataFrame (if level specified)
- """
- def remote_func(df):
- return df.prod(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, min_count=min_count,
- **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def product(self, axis=None, skipna=None, level=None, numeric_only=None,
- min_count=1, **kwargs):
- """Return the product of the values for the requested axis
-
- Args:
- axis : {index (0), columns (1)}
- skipna : boolean, default True
- level : int or level name, default None
- numeric_only : boolean, default None
- min_count : int, default 1
-
- Returns:
- product : Series or DataFrame (if level specified)
- """
- return self.prod(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, min_count=min_count,
- **kwargs)
-
- def quantile(self, q=0.5, axis=0, numeric_only=True,
- interpolation='linear'):
- """Return values at the given quantile over requested axis,
- a la numpy.percentile.
-
- Args:
- q (float): 0 <= q <= 1, the quantile(s) to compute
- axis (int): 0 or 'index' for row-wise,
- 1 or 'columns' for column-wise
- interpolation: {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- Specifies which interpolation method to use
-
- Returns:
- quantiles : Series or DataFrame
- If q is an array, a DataFrame will be returned where the
- index is q, the columns are the columns of self, and the
- values are the quantiles.
-
- If q is a float, a Series will be returned where the
- index is the columns of self and the values
- are the quantiles.
- """
-
- def check_bad_dtype(t):
- return t == np.dtype('O') or is_timedelta64_dtype(t)
-
- if not numeric_only:
- # check if there are any object columns
- if all(check_bad_dtype(t) for t in self.dtypes):
- raise TypeError("can't multiply sequence by non-int of type "
- "'float'")
- else:
- if next((True for t in self.dtypes if check_bad_dtype(t)),
- False):
- dtype = next(t for t in self.dtypes if check_bad_dtype(t))
- raise ValueError("Cannot compare type '{}' with type '{}'"
- .format(type(dtype), float))
- else:
- # Normally pandas returns this near the end of the quantile, but we
- # can't afford the overhead of running the entire operation before
- # we error.
- if all(check_bad_dtype(t) for t in self.dtypes):
- raise ValueError("need at least one array to concatenate")
-
- # check that all qs are between 0 and 1
- pandas.DataFrame()._check_percentile(q)
-
- def quantile_helper(df, base_object):
- """Quantile to be run inside each partitoin.
-
- Args:
- df: The DataFrame composing the partition.
- base_object: An empty pandas.Series or pandas.DataFrame
- depending on q.
-
- Returns:
- A new Series or DataFrame depending on q.
- """
- # This if call prevents ValueErrors with object only partitions
- if (numeric_only and
- all(dtype == np.dtype('O') or
- is_timedelta64_dtype(dtype)
- for dtype in df.dtypes)):
- return base_object
- else:
- return df.quantile(q=q, axis=axis, numeric_only=numeric_only,
- interpolation=interpolation)
-
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- if isinstance(q, (pandas.Series, np.ndarray, pandas.Index, list)):
-
- q_index = pandas.Float64Index(q)
-
- if axis == 0:
- new_partitions = _map_partitions(
- lambda df: quantile_helper(df, pandas.DataFrame()),
- self._col_partitions)
-
- # select only correct dtype columns
- new_columns = self.dtypes[self.dtypes.apply(
- lambda x: is_numeric_dtype(x))].index
-
- else:
- new_partitions = _map_partitions(
- lambda df: quantile_helper(df, pandas.DataFrame()),
- self._row_partitions)
- new_columns = self.index
-
- return DataFrame(col_partitions=new_partitions,
- index=q_index,
- columns=new_columns)
-
- else:
- # When q is a single float, we return a Series, so using
- # arithmetic_helper works well here.
- result = self._arithmetic_helper(
- lambda df: quantile_helper(df, pandas.Series()), axis)
- result.name = q
- return result
-
- def query(self, expr, inplace=False, **kwargs):
- """Queries the Dataframe with a boolean expression
-
- Returns:
- A new DataFrame if inplace=False
- """
- self._validate_eval_query(expr, **kwargs)
-
- columns = self.columns
-
- def query_helper(df):
- df = df.copy()
- df.columns = columns
- df.query(expr, inplace=True, **kwargs)
- df.columns = pandas.RangeIndex(0, len(df.columns))
- return df
-
- new_rows = _map_partitions(query_helper,
- self._row_partitions)
-
- if inplace:
- self._update_inplace(row_partitions=new_rows, index=self.index)
- else:
- return DataFrame(row_partitions=new_rows,
- col_metadata=self._col_metadata)
-
- def radd(self, other, axis='columns', level=None, fill_value=None):
- return self.add(other, axis, level, fill_value)
-
- def rank(self, axis=0, method='average', numeric_only=None,
- na_option='keep', ascending=True, pct=False):
-
- """
- Compute numerical data ranks (1 through n) along axis.
- Equal values are assigned a rank that is the [method] of
- the ranks of those values.
-
- Args:
- axis (int): 0 or 'index' for row-wise,
- 1 or 'columns' for column-wise
- interpolation: {'average', 'min', 'max', 'first', 'dense'}
- Specifies which method to use for equal vals
- numeric_only (boolean)
- Include only float, int, boolean data.
- na_option: {'keep', 'top', 'bottom'}
- Specifies how to handle NA options
- ascending (boolean):
- Decedes ranking order
- pct (boolean):
- Computes percentage ranking of data
- Returns:
- A new DataFrame
- """
-
- def rank_helper(df):
- return df.rank(axis=axis, method=method,
- numeric_only=numeric_only,
- na_option=na_option,
- ascending=ascending, pct=pct)
-
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- if (axis == 1):
- new_cols = self.dtypes[self.dtypes.apply(
- lambda x: is_numeric_dtype(x))].index
- result = _map_partitions(rank_helper,
- self._row_partitions)
- return DataFrame(row_partitions=result,
- columns=new_cols,
- index=self.index)
-
- if (axis == 0):
- result = _map_partitions(rank_helper,
- self._col_partitions)
- return DataFrame(col_partitions=result,
- columns=self.columns,
- index=self.index)
-
- def rdiv(self, other, axis='columns', level=None, fill_value=None):
- return self._single_df_op_helper(
- lambda df: df.rdiv(other, axis, level, fill_value),
- other, axis, level)
-
- def reindex(self, labels=None, index=None, columns=None, axis=None,
- method=None, copy=True, level=None, fill_value=np.nan,
- limit=None, tolerance=None):
- if level is not None:
- raise NotImplementedError(
- "Multilevel Index not Implemented. "
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
- if axis == 0 and labels is not None:
- index = labels
- elif labels is not None:
- columns = labels
-
- new_blocks = self._block_partitions
- if index is not None:
- old_index = self.index
- new_blocks = np.array([reindex_helper._submit(
- args=(old_index, index, 1, len(new_blocks), method,
- fill_value, limit, tolerance) + tuple(block.tolist()),
- num_return_vals=len(new_blocks))
- for block in new_blocks.T]).T
- else:
- index = self.index
-
- if columns is not None:
- old_columns = self.columns
- new_blocks = np.array([reindex_helper._submit(
- args=(old_columns, columns, 0, new_blocks.shape[1], method,
- fill_value, limit, tolerance) + tuple(block.tolist()),
- num_return_vals=new_blocks.shape[1])
- for block in new_blocks])
- else:
- columns = self.columns
-
- if copy:
- return DataFrame(block_partitions=new_blocks,
- index=index,
- columns=columns)
-
- self._update_inplace(block_partitions=new_blocks,
- index=index,
- columns=columns)
-
- def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
- limit=None, fill_value=np.nan):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def reindex_like(self, other, method=None, copy=True, limit=None,
- tolerance=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def rename(self, mapper=None, index=None, columns=None, axis=None,
- copy=True, inplace=False, level=None):
- """Alters axes labels.
-
- Args:
- mapper, index, columns: Transformations to apply to the axis's
- values.
- axis: Axis to target with mapper.
- copy: Also copy underlying data.
- inplace: Whether to return a new DataFrame.
- level: Only rename a specific level of a MultiIndex.
-
- Returns:
- If inplace is False, a new DataFrame with the updated axes.
- """
- inplace = validate_bool_kwarg(inplace, 'inplace')
-
- # We have to do this with the args because of how rename handles
- # kwargs. It doesn't ignore None values passed in, so we have to filter
- # them ourselves.
- args = locals()
- kwargs = {k: v for k, v in args.items()
- if v is not None and k != "self"}
- # inplace should always be true because this is just a copy, and we
- # will use the results after.
- kwargs['inplace'] = True
-
- df_to_rename = pandas.DataFrame(index=self.index, columns=self.columns)
- df_to_rename.rename(**kwargs)
-
- if inplace:
- obj = self
- else:
- obj = self.copy()
-
- obj.index = df_to_rename.index
- obj.columns = df_to_rename.columns
-
- if not inplace:
- return obj
-
- def rename_axis(self, mapper, axis=0, copy=True, inplace=False):
- axes_is_columns = axis == 1 or axis == "columns"
- renamed = self if inplace else self.copy()
- if axes_is_columns:
- renamed.columns.name = mapper
- else:
- renamed.index.name = mapper
- if not inplace:
- return renamed
-
- def _set_axis_name(self, name, axis=0, inplace=False):
- """Alter the name or names of the axis.
-
- Args:
- name: Name for the Index, or list of names for the MultiIndex
- axis: 0 or 'index' for the index; 1 or 'columns' for the columns
- inplace: Whether to modify `self` directly or return a copy
-
- Returns:
- Type of caller or None if inplace=True.
- """
- axes_is_columns = axis == 1 or axis == "columns"
- renamed = self if inplace else self.copy()
- if axes_is_columns:
- renamed.columns.set_names(name)
- else:
- renamed.index.set_names(name)
-
- if not inplace:
- return renamed
-
- def reorder_levels(self, order, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def replace(self, to_replace=None, value=None, inplace=False, limit=None,
- regex=False, method='pad', axis=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
- label=None, convention='start', kind=None, loffset=None,
- limit=None, base=0, on=None, level=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
- col_fill=''):
- """Reset this index to default and create column from current index.
-
- Args:
- level: Only remove the given levels from the index. Removes all
- levels by default
- drop: Do not try to insert index into dataframe columns. This
- resets the index to the default integer index.
- inplace: Modify the DataFrame in place (do not create a new object)
- col_level : If the columns have multiple levels, determines which
- level the labels are inserted into. By default it is inserted
- into the first level.
- col_fill: If the columns have multiple levels, determines how the
- other levels are named. If None then the index name is
- repeated.
-
- Returns:
- A new DataFrame if inplace is False, None otherwise.
- """
- inplace = validate_bool_kwarg(inplace, 'inplace')
- if inplace:
- new_obj = self
- else:
- new_obj = self.copy()
-
- def _maybe_casted_values(index, labels=None):
- if isinstance(index, pandas.PeriodIndex):
- values = index.asobject.values
- elif isinstance(index, pandas.DatetimeIndex) \
- and index.tz is not None:
- values = index
- else:
- values = index.values
- if values.dtype == np.object_:
- values = lib.maybe_convert_objects(values)
-
- # if we have the labels, extract the values with a mask
- if labels is not None:
- mask = labels == -1
-
- # we can have situations where the whole mask is -1,
- # meaning there is nothing found in labels, so make all nan's
- if mask.all():
- values = np.empty(len(mask))
- values.fill(np.nan)
- else:
- values = values.take(labels)
- if mask.any():
- values, changed = maybe_upcast_putmask(
- values, mask, np.nan)
- return values
-
- # We're building a new default index dataframe for use later.
- new_index = pandas.RangeIndex(len(self))
- if level is not None:
- if not isinstance(level, (tuple, list)):
- level = [level]
- level = [self.index._get_level_number(lev) for lev in level]
- if isinstance(self.index, pandas.MultiIndex):
- if len(level) < self.index.nlevels:
- new_index = self.index.droplevel(level)
-
- if not drop:
- if isinstance(self.index, pandas.MultiIndex):
- names = [n if n is not None else ('level_%d' % i)
- for (i, n) in enumerate(self.index.names)]
- to_insert = lzip(self.index.levels, self.index.labels)
- else:
- default = 'index'
- i = 0
- while default in self:
- default = 'level_{}'.format(i)
- i += 1
-
- names = ([default] if self.index.name is None
- else [self.index.name])
- to_insert = ((self.index, None),)
-
- multi_col = isinstance(self.columns, pandas.MultiIndex)
- for i, (lev, lab) in reversed(list(enumerate(to_insert))):
- if not (level is None or i in level):
- continue
- name = names[i]
- if multi_col:
- col_name = (list(name) if isinstance(name, tuple)
- else [name])
- if col_fill is None:
- if len(col_name) not in (1, self.columns.nlevels):
- raise ValueError("col_fill=None is incompatible "
- "with incomplete column name "
- "{}".format(name))
- col_fill = col_name[0]
-
- lev_num = self.columns._get_level_number(col_level)
- name_lst = [col_fill] * lev_num + col_name
- missing = self.columns.nlevels - len(name_lst)
- name_lst += [col_fill] * missing
- name = tuple(name_lst)
- # to ndarray and maybe infer different dtype
- level_values = _maybe_casted_values(lev, lab)
- new_obj.insert(0, name, level_values)
-
- new_obj.index = new_index
-
- if not inplace:
- return new_obj
-
- def rfloordiv(self, other, axis='columns', level=None, fill_value=None):
- return self._single_df_op_helper(
- lambda df: df.rfloordiv(other, axis, level, fill_value),
- other, axis, level)
-
- def rmod(self, other, axis='columns', level=None, fill_value=None):
- return self._single_df_op_helper(
- lambda df: df.rmod(other, axis, level, fill_value),
- other, axis, level)
-
- def rmul(self, other, axis='columns', level=None, fill_value=None):
- return self.mul(other, axis, level, fill_value)
-
- def rolling(self, window, min_periods=None, freq=None, center=False,
- win_type=None, on=None, axis=0, closed=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def round(self, decimals=0, *args, **kwargs):
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.round(decimals=decimals, *args, **kwargs), block)
- for block in self._block_partitions])
-
- return DataFrame(block_partitions=new_block_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata)
-
- def rpow(self, other, axis='columns', level=None, fill_value=None):
- return self._single_df_op_helper(
- lambda df: df.rpow(other, axis, level, fill_value),
- other, axis, level)
-
- def rsub(self, other, axis='columns', level=None, fill_value=None):
- return self._single_df_op_helper(
- lambda df: df.rsub(other, axis, level, fill_value),
- other, axis, level)
-
- def rtruediv(self, other, axis='columns', level=None, fill_value=None):
- return self._single_df_op_helper(
- lambda df: df.rtruediv(other, axis, level, fill_value),
- other, axis, level)
-
- def sample(self, n=None, frac=None, replace=False, weights=None,
- random_state=None, axis=None):
- """Returns a random sample of items from an axis of object.
-
- Args:
- n: Number of items from axis to return. Cannot be used with frac.
- Default = 1 if frac = None.
- frac: Fraction of axis items to return. Cannot be used with n.
- replace: Sample with or without replacement. Default = False.
- weights: Default 'None' results in equal probability weighting.
- If passed a Series, will align with target object on index.
- Index values in weights not found in sampled object will be
- ignored and index values in sampled object not in weights will
- be assigned weights of zero. If called on a DataFrame, will
- accept the name of a column when axis = 0. Unless weights are
- a Series, weights must be same length as axis being sampled.
- If weights do not sum to 1, they will be normalized to sum
- to 1. Missing values in the weights column will be treated as
- zero. inf and -inf values not allowed.
- random_state: Seed for the random number generator (if int), or
- numpy RandomState object.
- axis: Axis to sample. Accepts axis number or name.
-
- Returns:
- A new Dataframe
- """
-
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
-
- if axis == 0:
- axis_length = len(self._row_metadata)
- else:
- axis_length = len(self._col_metadata)
-
- if weights is not None:
-
- # Index of the weights Series should correspond to the index of the
- # Dataframe in order to sample
- if isinstance(weights, pandas.Series):
- weights = weights.reindex(self.axes[axis])
-
- # If weights arg is a string, the weights used for sampling will
- # the be values in the column corresponding to that string
- if isinstance(weights, string_types):
- if axis == 0:
- try:
- weights = self[weights]
- except KeyError:
- raise KeyError("String passed to weights not a "
- "valid column")
- else:
- raise ValueError("Strings can only be passed to "
- "weights when sampling from rows on "
- "a DataFrame")
-
- weights = pandas.Series(weights, dtype='float64')
-
- if len(weights) != axis_length:
- raise ValueError("Weights and axis to be sampled must be of "
- "same length")
-
- if (weights == np.inf).any() or (weights == -np.inf).any():
- raise ValueError("weight vector may not include `inf` values")
-
- if (weights < 0).any():
- raise ValueError("weight vector many not include negative "
- "values")
-
- # weights cannot be NaN when sampling, so we must set all nan
- # values to 0
- weights = weights.fillna(0)
-
- # If passed in weights are not equal to 1, renormalize them
- # otherwise numpy sampling function will error
- weights_sum = weights.sum()
- if weights_sum != 1:
- if weights_sum != 0:
- weights = weights / weights_sum
- else:
- raise ValueError("Invalid weights: weights sum to zero")
-
- weights = weights.values
-
- if n is None and frac is None:
- # default to n = 1 if n and frac are both None (in accordance with
- # Pandas specification)
- n = 1
- elif n is not None and frac is None and n % 1 != 0:
- # n must be an integer
- raise ValueError("Only integers accepted as `n` values")
- elif n is None and frac is not None:
- # compute the number of samples based on frac
- n = int(round(frac * axis_length))
- elif n is not None and frac is not None:
- # Pandas specification does not allow both n and frac to be passed
- # in
- raise ValueError('Please enter a value for `frac` OR `n`, not '
- 'both')
- if n < 0:
- raise ValueError("A negative number of rows requested. Please "
- "provide positive value.")
-
- if n == 0:
- # An Empty DataFrame is returned if the number of samples is 0.
- # The Empty Dataframe should have either columns or index specified
- # depending on which axis is passed in.
- return DataFrame(columns=[] if axis == 1 else self.columns,
- index=self.index if axis == 1 else [])
-
- if axis == 1:
- axis_labels = self.columns
- partition_metadata = self._col_metadata
- partitions = self._col_partitions
- else:
- axis_labels = self.index
- partition_metadata = self._row_metadata
- partitions = self._row_partitions
-
- if random_state is not None:
- # Get a random number generator depending on the type of
- # random_state that is passed in
- if isinstance(random_state, int):
- random_num_gen = np.random.RandomState(random_state)
- elif isinstance(random_state, np.random.randomState):
- random_num_gen = random_state
- else:
- # random_state must be an int or a numpy RandomState object
- raise ValueError("Please enter an `int` OR a "
- "np.random.RandomState for random_state")
-
- # choose random numbers and then get corresponding labels from
- # chosen axis
- sample_indices = random_num_gen.randint(
- low=0,
- high=len(partition_metadata),
- size=n)
- samples = axis_labels[sample_indices]
- else:
- # randomly select labels from chosen axis
- samples = np.random.choice(a=axis_labels, size=n,
- replace=replace, p=weights)
-
- # create an array of (partition, index_within_partition) tuples for
- # each sample
- part_ind_tuples = [partition_metadata[sample]
- for sample in samples]
-
- if axis == 1:
- # tup[0] refers to the partition number and tup[1] is the index
- # within that partition
- new_cols = [_deploy_func.remote(lambda df: df.iloc[:, [tup[1]]],
- partitions[tup[0]]) for tup in part_ind_tuples]
- return DataFrame(col_partitions=new_cols,
- columns=samples,
- index=self.index)
- else:
- new_rows = [_deploy_func.remote(lambda df: df.loc[[tup[1]]],
- partitions[tup[0]]) for tup in part_ind_tuples]
- return DataFrame(row_partitions=new_rows,
- columns=self.columns,
- index=samples)
-
- def select(self, crit, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def select_dtypes(self, include=None, exclude=None):
- # Validates arguments for whether both include and exclude are None or
- # if they are disjoint. Also invalidates string dtypes.
- pandas.DataFrame().select_dtypes(include, exclude)
-
- if include and not is_list_like(include):
- include = [include]
- elif not include:
- include = []
-
- if exclude and not is_list_like(exclude):
- exclude = [exclude]
- elif not exclude:
- exclude = []
-
- sel = tuple(map(set, (include, exclude)))
-
- include, exclude = map(
- lambda x: set(map(_get_dtype_from_object, x)), sel)
-
- include_these = pandas.Series(not bool(include), index=self.columns)
- exclude_these = pandas.Series(not bool(exclude), index=self.columns)
-
- def is_dtype_instance_mapper(column, dtype):
- return column, functools.partial(issubclass, dtype.type)
-
- for column, f in itertools.starmap(is_dtype_instance_mapper,
- self.dtypes.iteritems()):
- if include: # checks for the case of empty include or exclude
- include_these[column] = any(map(f, include))
- if exclude:
- exclude_these[column] = not any(map(f, exclude))
-
- dtype_indexer = include_these & exclude_these
- indicate = [i for i in range(len(dtype_indexer.values))
- if not dtype_indexer.values[i]]
- return self.drop(columns=self.columns[indicate], inplace=False)
-
- def sem(self, axis=None, skipna=None, level=None, ddof=1,
- numeric_only=None, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def set_axis(self, labels, axis=0, inplace=None):
- """Assign desired index to given axis.
-
- Args:
- labels (pandas.Index or list-like): The Index to assign.
- axis (string or int): The axis to reassign.
- inplace (bool): Whether to make these modifications inplace.
-
- Returns:
- If inplace is False, returns a new DataFrame, otherwise None.
- """
- if is_scalar(labels):
- warnings.warn(
- 'set_axis now takes "labels" as first argument, and '
- '"axis" as named parameter. The old form, with "axis" as '
- 'first parameter and \"labels\" as second, is still supported '
- 'but will be deprecated in a future version of pandas.',
- FutureWarning, stacklevel=2)
- labels, axis = axis, labels
-
- if inplace is None:
- warnings.warn(
- 'set_axis currently defaults to operating inplace.\nThis '
- 'will change in a future version of pandas, use '
- 'inplace=True to avoid this warning.',
- FutureWarning, stacklevel=2)
- inplace = True
- if inplace:
- setattr(self, pandas.DataFrame()._get_axis_name(axis), labels)
- else:
- obj = self.copy()
- obj.set_axis(labels, axis=axis, inplace=True)
- return obj
-
- def set_index(self, keys, drop=True, append=False, inplace=False,
- verify_integrity=False):
- """Set the DataFrame index using one or more existing columns.
-
- Args:
- keys: column label or list of column labels / arrays.
- drop (boolean): Delete columns to be used as the new index.
- append (boolean): Whether to append columns to existing index.
- inplace (boolean): Modify the DataFrame in place.
- verify_integrity (boolean): Check the new index for duplicates.
- Otherwise defer the check until necessary. Setting to False
- will improve the performance of this method
-
- Returns:
- If inplace is set to false returns a new DataFrame, otherwise None.
- """
- inplace = validate_bool_kwarg(inplace, 'inplace')
- if not isinstance(keys, list):
- keys = [keys]
-
- if inplace:
- frame = self
- else:
- frame = self.copy()
-
- arrays = []
- names = []
- if append:
- names = [x for x in self.index.names]
- if isinstance(self.index, pandas.MultiIndex):
- for i in range(self.index.nlevels):
- arrays.append(self.index._get_level_values(i))
- else:
- arrays.append(self.index)
-
- to_remove = []
- for col in keys:
- if isinstance(col, pandas.MultiIndex):
- # append all but the last column so we don't have to modify
- # the end of this loop
- for n in range(col.nlevels - 1):
- arrays.append(col._get_level_values(n))
-
- level = col._get_level_values(col.nlevels - 1)
- names.extend(col.names)
- elif isinstance(col, pandas.Series):
- level = col._values
- names.append(col.name)
- elif isinstance(col, pandas.Index):
- level = col
- names.append(col.name)
- elif isinstance(col, (list, np.ndarray, pandas.Index)):
- level = col
- names.append(None)
- else:
- level = frame[col]._values
- names.append(col)
- if drop:
- to_remove.append(col)
- arrays.append(level)
-
- index = _ensure_index_from_sequences(arrays, names)
-
- if verify_integrity and not index.is_unique:
- duplicates = index.get_duplicates()
- raise ValueError('Index has duplicate keys: %s' % duplicates)
-
- for c in to_remove:
- del frame[c]
-
- # clear up memory usage
- index._cleanup()
-
- frame.index = index
-
- if not inplace:
- return frame
-
- def set_value(self, index, col, value, takeable=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def shift(self, periods=1, freq=None, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def skew(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- """Return unbiased skew over requested axis Normalized by N-1
-
- Args:
- axis : {index (0), columns (1)}
- skipna : boolean, default True
- Exclude NA/null values when computing the result.
- level : int or level name, default None
- numeric_only : boolean, default None
-
- Returns:
- skew : Series or DataFrame (if level specified)
- """
- def remote_func(df):
- return df.skew(axis=axis, skipna=skipna, level=level,
- numeric_only=numeric_only, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def slice_shift(self, periods=1, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
- kind='quicksort', na_position='last', sort_remaining=True,
- by=None):
- """Sort a DataFrame by one of the indices (columns or index).
-
- Args:
- axis: The axis to sort over.
- level: The MultiIndex level to sort over.
- ascending: Ascending or descending
- inplace: Whether or not to update this DataFrame inplace.
- kind: How to perform the sort.
- na_position: Where to position NA on the sort.
- sort_remaining: On Multilevel Index sort based on all levels.
- by: (Deprecated) argument to pass to sort_values.
-
- Returns:
- A sorted DataFrame
- """
- if level is not None:
- raise NotImplementedError("Multilevel index not yet implemented.")
-
- if by is not None:
- warnings.warn("by argument to sort_index is deprecated, "
- "please use .sort_values(by=...)",
- FutureWarning, stacklevel=2)
- if level is not None:
- raise ValueError("unable to simultaneously sort by and level")
- return self.sort_values(by, axis=axis, ascending=ascending,
- inplace=inplace)
-
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- args = (axis, level, ascending, False, kind, na_position,
- sort_remaining)
-
- def _sort_helper(df, index, axis, *args):
- if axis == 0:
- df.index = index
- else:
- df.columns = index
-
- result = df.sort_index(*args)
- df.reset_index(drop=True, inplace=True)
- df.columns = pandas.RangeIndex(len(df.columns))
- return result
-
- if axis == 0:
- index = self.index
- new_column_parts = _map_partitions(
- lambda df: _sort_helper(df, index, axis, *args),
- self._col_partitions)
-
- new_columns = self.columns
- new_index = self.index.sort_values(ascending=ascending)
- new_row_parts = None
- else:
- columns = self.columns
- new_row_parts = _map_partitions(
- lambda df: _sort_helper(df, columns, axis, *args),
- self._row_partitions)
-
- new_columns = self.columns.sort_values(ascending=ascending)
- new_index = self.index
- new_column_parts = None
-
- if not inplace:
- return DataFrame(col_partitions=new_column_parts,
- row_partitions=new_row_parts,
- index=new_index,
- columns=new_columns)
- else:
- self._update_inplace(row_partitions=new_row_parts,
- col_partitions=new_column_parts,
- columns=new_columns,
- index=new_index)
-
- def sort_values(self, by, axis=0, ascending=True, inplace=False,
- kind='quicksort', na_position='last'):
- """Sorts by a column/row or list of columns/rows.
-
- Args:
- by: A list of labels for the axis to sort over.
- axis: The axis to sort.
- ascending: Sort in ascending or descending order.
- inplace: If true, do the operation inplace.
- kind: How to sort.
- na_position: Where to put np.nan values.
-
- Returns:
- A sorted DataFrame.
- """
-
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- if not is_list_like(by):
- by = [by]
-
- if axis == 0:
- broadcast_value_dict = {str(col): self[col] for col in by}
- broadcast_values = pandas.DataFrame(broadcast_value_dict)
- else:
- broadcast_value_list = [to_pandas(self[row::len(self.index)])
- for row in by]
-
- index_builder = list(zip(broadcast_value_list, by))
-
- for row, idx in index_builder:
- row.index = [str(idx)]
-
- broadcast_values = \
- pandas.concat([row for row, idx in index_builder], copy=False)
-
- # We are converting the by to string here so that we don't have a
- # collision with the RangeIndex on the inner frame. It is cheap and
- # gaurantees that we sort by the correct column.
- by = [str(col) for col in by]
-
- args = (by, axis, ascending, False, kind, na_position)
-
- def _sort_helper(df, broadcast_values, axis, *args):
- """Sorts the data on a partition.
-
- Args:
- df: The DataFrame to sort.
- broadcast_values: The by DataFrame to use for the sort.
- axis: The axis to sort over.
- args: The args for the sort.
-
- Returns:
- A new sorted DataFrame.
- """
- if axis == 0:
- broadcast_values.index = df.index
- names = broadcast_values.columns
- else:
- broadcast_values.columns = df.columns
- names = broadcast_values.index
-
- return pandas.concat([df, broadcast_values], axis=axis ^ 1,
- copy=False).sort_values(*args) \
- .drop(names, axis=axis ^ 1)
-
- if axis == 0:
- new_column_partitions = _map_partitions(
- lambda df: _sort_helper(df, broadcast_values, axis, *args),
- self._col_partitions)
-
- new_row_partitions = None
- new_columns = self.columns
-
- # This is important because it allows us to get the axis that we
- # aren't sorting over. We need the order of the columns/rows and
- # this will provide that in the return value.
- new_index = broadcast_values.sort_values(*args).index
- else:
- new_row_partitions = _map_partitions(
- lambda df: _sort_helper(df, broadcast_values, axis, *args),
- self._row_partitions)
-
- new_column_partitions = None
- new_columns = broadcast_values.sort_values(*args).columns
- new_index = self.index
-
- if inplace:
- self._update_inplace(row_partitions=new_row_partitions,
- col_partitions=new_column_partitions,
- columns=new_columns,
- index=new_index)
- else:
- return DataFrame(row_partitions=new_row_partitions,
- col_partitions=new_column_partitions,
- columns=new_columns,
- index=new_index,
- dtypes_cache=self._dtypes_cache)
-
- def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
- sort_remaining=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def squeeze(self, axis=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def stack(self, level=-1, dropna=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def std(self, axis=None, skipna=None, level=None, ddof=1,
- numeric_only=None, **kwargs):
- """Computes standard deviation across the DataFrame.
-
- Args:
- axis (int): The axis to take the std on.
- skipna (bool): True to skip NA values, false otherwise.
- ddof (int): degrees of freedom
-
- Returns:
- The std of the DataFrame (Pandas Series)
- """
- def remote_func(df):
- return df.std(axis=axis, skipna=skipna, level=level, ddof=ddof,
- numeric_only=numeric_only, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def sub(self, other, axis='columns', level=None, fill_value=None):
- """Subtract a DataFrame/Series/scalar from this DataFrame.
-
- Args:
- other: The object to use to apply the subtraction to this.
- axis: THe axis to apply the subtraction over.
- level: Mutlilevel index level to subtract over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the subtraciont applied.
- """
- return self._operator_helper(pandas.DataFrame.sub, other, axis, level,
- fill_value)
-
- def subtract(self, other, axis='columns', level=None, fill_value=None):
- """Alias for sub.
-
- Args:
- other: The object to use to apply the subtraction to this.
- axis: THe axis to apply the subtraction over.
- level: Mutlilevel index level to subtract over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the subtraciont applied.
- """
- return self.sub(other, axis, level, fill_value)
-
- def swapaxes(self, axis1, axis2, copy=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def swaplevel(self, i=-2, j=-1, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def tail(self, n=5):
- """Get the last n rows of the dataframe.
-
- Args:
- n (int): The number of rows to return.
-
- Returns:
- A new dataframe with the last n rows of this dataframe.
- """
- if n >= len(self._row_metadata):
- return self
-
- new_dfs = _map_partitions(lambda df: df.tail(n),
- self._col_partitions)
-
- index = self._row_metadata.index[-n:]
- return DataFrame(col_partitions=new_dfs,
- col_metadata=self._col_metadata,
- index=index,
- dtypes_cache=self._dtypes_cache)
-
- def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_clipboard(self, excel=None, sep=None, **kwargs):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_clipboard(excel, sep, **kwargs)
-
- def to_csv(self, path_or_buf=None, sep=",", na_rep="", float_format=None,
- columns=None, header=True, index=True, index_label=None,
- mode="w", encoding=None, compression=None, quoting=None,
- quotechar='"', line_terminator="\n", chunksize=None,
- tupleize_cols=None, date_format=None, doublequote=True,
- escapechar=None, decimal="."):
-
- kwargs = {
- 'path_or_buf': path_or_buf,
- 'sep': sep,
- 'na_rep': na_rep,
- 'float_format': float_format,
- 'columns': columns,
- 'header': header,
- 'index': index,
- 'index_label': index_label,
- 'mode': mode,
- 'encoding': encoding,
- 'compression': compression,
- 'quoting': quoting,
- 'quotechar': quotechar,
- 'line_terminator': line_terminator,
- 'chunksize': chunksize,
- 'tupleize_cols': tupleize_cols,
- 'date_format': date_format,
- 'doublequote': doublequote,
- 'escapechar': escapechar,
- 'decimal': decimal
- }
-
- if compression is not None:
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
- return to_pandas(self).to_csv(**kwargs)
-
- if tupleize_cols is not None:
- warnings.warn("The 'tupleize_cols' parameter is deprecated and "
- "will be removed in a future version",
- FutureWarning, stacklevel=2)
- else:
- tupleize_cols = False
-
- remote_kwargs_id = ray.put(dict(kwargs, path_or_buf=None))
- columns_id = ray.put(self.columns)
-
- def get_csv_str(df, index, columns, header, kwargs):
- df.index = index
- df.columns = columns
- kwargs["header"] = header
- return df.to_csv(**kwargs)
-
- idxs = [0] + np.cumsum(self._row_metadata._lengths).tolist()
- idx_args = [self.index[idxs[i]:idxs[i + 1]]
- for i in range(len(self._row_partitions))]
- csv_str_ids = _map_partitions(
- get_csv_str, self._row_partitions, idx_args,
- [columns_id] * len(self._row_partitions),
- [header] + [False] * (len(self._row_partitions) - 1),
- [remote_kwargs_id] * len(self._row_partitions))
-
- if path_or_buf is None:
- buf = io.StringIO()
- elif isinstance(path_or_buf, str):
- buf = open(path_or_buf, mode)
- else:
- buf = path_or_buf
-
- for csv_str_id in csv_str_ids:
- buf.write(ray.get(csv_str_id))
- buf.flush()
-
- result = None
- if path_or_buf is None:
- result = buf.getvalue()
- buf.close()
- elif isinstance(path_or_buf, str):
- buf.close()
- return result
-
- def to_dense(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_dict(self, orient='dict', into=dict):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
- float_format=None, columns=None, header=True, index=True,
- index_label=None, startrow=0, startcol=0, engine=None,
- merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
- freeze_panes=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_excel(excel_writer, sheet_name, na_rep,
- float_format, columns, header, index,
- index_label, startrow, startcol, engine,
- merge_cells, encoding, inf_rep, verbose,
- freeze_panes)
-
- def to_feather(self, fname):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_feather(fname)
-
- def to_gbq(self, destination_table, project_id, chunksize=10000,
- verbose=True, reauth=False, if_exists='fail',
- private_key=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_hdf(self, path_or_buf, key, **kwargs):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_hdf(path_or_buf, key, **kwargs)
-
- def to_html(self, buf=None, columns=None, col_space=None, header=True,
- index=True, na_rep='np.NaN', formatters=None,
- float_format=None, sparsify=None, index_names=True,
- justify=None, bold_rows=True, classes=None, escape=True,
- max_rows=None, max_cols=None, show_dimensions=False,
- notebook=False, decimal='.', border=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_html(buf, columns, col_space, header,
- index, na_rep, formatters,
- float_format, sparsify, index_names,
- justify, bold_rows, classes, escape,
- max_rows, max_cols, show_dimensions,
- notebook, decimal, border)
-
- def to_json(self, path_or_buf=None, orient=None, date_format=None,
- double_precision=10, force_ascii=True, date_unit='ms',
- default_handler=None, lines=False, compression=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_json(path_or_buf, orient, date_format,
- double_precision, force_ascii, date_unit,
- default_handler, lines, compression)
-
- def to_latex(self, buf=None, columns=None, col_space=None, header=True,
- index=True, na_rep='np.NaN', formatters=None,
- float_format=None, sparsify=None, index_names=True,
- bold_rows=False, column_format=None, longtable=None,
- escape=None, encoding=None, decimal='.', multicolumn=None,
- multicolumn_format=None, multirow=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_msgpack(path_or_buf, encoding, **kwargs)
-
- def to_panel(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_parquet(self, fname, engine='auto', compression='snappy',
- **kwargs):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_parquet(fname, engine, compression, **kwargs)
-
- def to_period(self, freq=None, axis=0, copy=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_pickle(self, path, compression='infer',
- protocol=pkl.HIGHEST_PROTOCOL):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_pickle(path, compression, protocol)
-
- def to_records(self, index=True, convert_datetime64=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_sparse(self, fill_value=None, kind='block'):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
- index=True, index_label=None, chunksize=None, dtype=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_sql(name, con, flavor, schema, if_exists,
- index, index_label, chunksize, dtype)
-
- def to_stata(self, fname, convert_dates=None, write_index=True,
- encoding='latin-1', byteorder=None, time_stamp=None,
- data_label=None, variable_labels=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = to_pandas(self)
- port_frame.to_stata(fname, convert_dates, write_index,
- encoding, byteorder, time_stamp,
- data_label, variable_labels)
-
- def to_string(self, buf=None, columns=None, col_space=None, header=True,
- index=True, na_rep='np.NaN', formatters=None,
- float_format=None, sparsify=None, index_names=True,
- justify=None, line_width=None, max_rows=None, max_cols=None,
- show_dimensions=False):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def to_xarray(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def transform(self, func, *args, **kwargs):
- kwargs["is_transform"] = True
- result = self.agg(func, *args, **kwargs)
- try:
- result.columns = self.columns
- result.index = self.index
- except ValueError:
- raise ValueError("transforms cannot produce aggregated results")
- return result
-
- def truediv(self, other, axis='columns', level=None, fill_value=None):
- """Divides this DataFrame against another DataFrame/Series/scalar.
-
- Args:
- other: The object to use to apply the divide against this.
- axis: The axis to divide over.
- level: The Multilevel index level to apply divide over.
- fill_value: The value to fill NaNs with.
-
- Returns:
- A new DataFrame with the Divide applied.
- """
- return self._operator_helper(pandas.DataFrame.truediv, other, axis,
- level, fill_value)
-
- def truncate(self, before=None, after=None, axis=None, copy=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def tshift(self, periods=1, freq=None, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def tz_convert(self, tz, axis=0, level=None, copy=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def tz_localize(self, tz, axis=0, level=None, copy=True,
- ambiguous='raise'):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def unstack(self, level=-1, fill_value=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def update(self, other, join='left', overwrite=True, filter_func=None,
- raise_conflict=False):
- """Modify DataFrame in place using non-NA values from other.
-
- Args:
- other: DataFrame, or object coercible into a DataFrame
- join: {'left'}, default 'left'
- overwrite: If True then overwrite values for common keys in frame
- filter_func: Can choose to replace values other than NA.
- raise_conflict: If True, will raise an error if the DataFrame and
- other both contain data in the same place.
-
- Returns:
- None
- """
- if raise_conflict:
- raise NotImplementedError(
- "raise_conflict parameter not yet supported. "
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- if not isinstance(other, DataFrame):
- other = DataFrame(other)
-
- def update_helper(x, y):
- x.update(y, join, overwrite, filter_func, False)
- return x
-
- self._inter_df_op_helper(update_helper, other, join, 0, None,
- inplace=True)
-
- def var(self, axis=None, skipna=None, level=None, ddof=1,
- numeric_only=None, **kwargs):
- """Computes variance across the DataFrame.
-
- Args:
- axis (int): The axis to take the variance on.
- skipna (bool): True to skip NA values, false otherwise.
- ddof (int): degrees of freedom
-
- Returns:
- The variance of the DataFrame.
- """
- def remote_func(df):
- return df.var(axis=axis, skipna=skipna, level=level, ddof=ddof,
- numeric_only=numeric_only, **kwargs)
-
- return self._arithmetic_helper(remote_func, axis, level)
-
- def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
- errors='raise', try_cast=False, raise_on_error=None):
- """Replaces values not meeting condition with values in other.
-
- Args:
- cond: A condition to be met, can be callable, array-like or a
- DataFrame.
- other: A value or DataFrame of values to use for setting this.
- inplace: Whether or not to operate inplace.
- axis: The axis to apply over. Only valid when a Series is passed
- as other.
- level: The MultiLevel index level to apply over.
- errors: Whether or not to raise errors. Does nothing in Pandas.
- try_cast: Try to cast the result back to the input type.
- raise_on_error: Whether to raise invalid datatypes (deprecated).
-
- Returns:
- A new DataFrame with the replaced values.
- """
-
- inplace = validate_bool_kwarg(inplace, 'inplace')
-
- if isinstance(other, pandas.Series) and axis is None:
- raise ValueError("Must specify axis=0 or 1")
-
- if level is not None:
- raise NotImplementedError("Multilevel Index not yet supported on "
- "Pandas on Ray.")
-
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
-
- cond = cond(self) if callable(cond) else cond
-
- if not isinstance(cond, DataFrame):
- if not hasattr(cond, 'shape'):
- cond = np.asanyarray(cond)
- if cond.shape != self.shape:
- raise ValueError("Array conditional must be same shape as "
- "self")
- cond = DataFrame(cond, index=self.index, columns=self.columns)
-
- zipped_partitions = self._copartition(cond, self.index)
- args = (False, axis, level, errors, try_cast, raise_on_error)
-
- if isinstance(other, DataFrame):
- other_zipped = (v for k, v in self._copartition(other,
- self.index))
-
- new_partitions = [_where_helper.remote(k, v, next(other_zipped),
- self.columns, cond.columns,
- other.columns, *args)
- for k, v in zipped_partitions]
-
- # Series has to be treated specially because we're operating on row
- # partitions from here on.
- elif isinstance(other, pandas.Series):
- if axis == 0:
- # Pandas determines which index to use based on axis.
- other = other.reindex(self.index)
- other.index = pandas.RangeIndex(len(other))
-
- # Since we're working on row partitions, we have to partition
- # the Series based on the partitioning of self (since both
- # self and cond are co-partitioned by self.
- other_builder = []
- for length in self._row_metadata._lengths:
- other_builder.append(other[:length])
- other = other[length:]
- # Resetting the index here ensures that we apply each part
- # to the correct row within the partitions.
- other.index = pandas.RangeIndex(len(other))
-
- other = (obj for obj in other_builder)
-
- new_partitions = [_where_helper.remote(k, v,
- next(other,
- pandas.Series()),
- self.columns,
- cond.columns,
- None, *args)
- for k, v in zipped_partitions]
- else:
- other = other.reindex(self.columns)
- new_partitions = [_where_helper.remote(k, v, other,
- self.columns,
- cond.columns,
- None, *args)
- for k, v in zipped_partitions]
-
- else:
- new_partitions = [_where_helper.remote(k, v, other, self.columns,
- cond.columns, None, *args)
- for k, v in zipped_partitions]
-
- if inplace:
- self._update_inplace(row_partitions=new_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata)
- else:
- return DataFrame(row_partitions=new_partitions,
- row_metadata=self._row_metadata,
- col_metadata=self._col_metadata)
-
- def xs(self, key, axis=0, level=None, drop_level=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __getitem__(self, key):
- """Get the column specified by key for this DataFrame.
-
- Args:
- key : The column name.
-
- Returns:
- A Pandas Series representing the value for the column.
- """
- key = com._apply_if_callable(key, self)
-
- # shortcut if we are an actual column
- is_mi_columns = isinstance(self.columns, pandas.MultiIndex)
- try:
- if key in self.columns and not is_mi_columns:
- return self._getitem_column(key)
- except (KeyError, ValueError, TypeError):
- pass
-
- # see if we can slice the rows
- indexer = self._row_metadata.convert_to_index_sliceable(key)
- if indexer is not None:
- return self._getitem_slice(indexer)
-
- if isinstance(key, (pandas.Series, np.ndarray, pandas.Index, list)):
- return self._getitem_array(key)
- elif isinstance(key, DataFrame):
- raise NotImplementedError("To contribute to Pandas on Ray, please"
- "visit github.com/ray-project/ray.")
- # return self._getitem_frame(key)
- elif is_mi_columns:
- raise NotImplementedError("To contribute to Pandas on Ray, please"
- "visit github.com/ray-project/ray.")
- # return self._getitem_multilevel(key)
- else:
- return self._getitem_column(key)
-
- def _getitem_column(self, key):
- # may result in multiple columns?
- partition = self._col_metadata[key, 'partition']
- result = ray.get(self._getitem_indiv_col(key, partition))
- result.name = key
- result.index = self.index
- return result
-
- def _getitem_array(self, key):
- if com.is_bool_indexer(key):
- if isinstance(key, pandas.Series) and \
- not key.index.equals(self.index):
- warnings.warn("Boolean Series key will be reindexed to match "
- "DataFrame index.", UserWarning, stacklevel=3)
- elif len(key) != len(self.index):
- raise ValueError('Item wrong length {} instead of {}.'.format(
- len(key), len(self.index)))
- key = check_bool_indexer(self.index, key)
-
- new_parts = _map_partitions(lambda df: df[key],
- self._col_partitions)
- columns = self.columns
- index = self.index[key]
-
- return DataFrame(col_partitions=new_parts,
- columns=columns,
- index=index)
- else:
- columns = self._col_metadata[key].index
- column_indices = {item: i for i, item in enumerate(self.columns)}
- indices_for_rows = [column_indices[column] for column in columns]
-
- def get_columns_partition(df):
- result = df.__getitem__(indices_for_rows),
- result.columns = pandas.RangeIndex(0, len(result.columns))
- return result
-
- new_parts = [_deploy_func.remote(
- lambda df: df.__getitem__(indices_for_rows),
- part) for part in self._row_partitions]
-
- index = self.index
-
- return DataFrame(row_partitions=new_parts,
- columns=columns,
- index=index)
-
- def _getitem_indiv_col(self, key, part):
- loc = self._col_metadata[key]
- if isinstance(loc, pandas.Series):
- index = loc[loc['partition'] == part]
- else:
- index = loc[loc['partition'] == part]['index_within_partition']
- return _deploy_func.remote(
- lambda df: df.__getitem__(index),
- self._col_partitions[part])
-
- def _getitem_slice(self, key):
- new_cols = _map_partitions(lambda df: df[key],
- self._col_partitions)
-
- index = self.index[key]
- return DataFrame(col_partitions=new_cols,
- col_metadata=self._col_metadata,
- index=index)
-
- def __getattr__(self, key):
- """After regular attribute access, looks up the name in the columns
-
- Args:
- key (str): Attribute name.
-
- Returns:
- The value of the attribute.
- """
- try:
- return object.__getattribute__(self, key)
- except AttributeError as e:
- if key in self.columns:
- return self[key]
- raise e
-
- def __setitem__(self, key, value):
- if not isinstance(key, str):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
- if key not in self.columns:
- self.insert(loc=len(self.columns), column=key, value=value)
- else:
- loc = self.columns.get_loc(key)
- self.__delitem__(key)
- self.insert(loc=loc, column=key, value=value)
-
- def __len__(self):
- """Gets the length of the dataframe.
-
- Returns:
- Returns an integer length of the dataframe object.
- """
- return len(self._row_metadata)
-
- def __unicode__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __invert__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __hash__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __iter__(self):
- """Iterate over the columns
-
- Returns:
- An Iterator over the columns of the dataframe.
- """
- return iter(self.columns)
-
- def __contains__(self, key):
- """Searches columns for specific key
-
- Args:
- key : The column name
-
- Returns:
- Returns a boolean if the specified key exists as a column name
- """
- return self.columns.__contains__(key)
-
- def __nonzero__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __bool__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __abs__(self):
- """Creates a modified DataFrame by taking the absolute value.
-
- Returns:
- A modified DataFrame
- """
- return self.abs()
-
- def __round__(self, decimals=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __array__(self, dtype=None):
- # TODO: This is very inefficient and needs fix, also see as_matrix
- return to_pandas(self).__array__(dtype=dtype)
-
- def __array_wrap__(self, result, context=None):
- # TODO: This is very inefficient, see also __array__ and as_matrix
- return to_pandas(self).__array_wrap__(result, context=context)
-
- def __getstate__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __setstate__(self, state):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __delitem__(self, key):
- """Delete a column by key. `del a[key]` for example.
- Operation happens in place.
-
- Notes: This operation happen on row and column partition
- simultaneously. No rebuild.
- Args:
- key: key to delete
- """
- # Create helper method for deleting column(s) in row partition.
- def del_helper(df, to_delete):
- cols = df.columns[to_delete] # either int or an array of ints
-
- if not is_list_like(cols):
- cols = [cols]
-
- for col in cols:
- df.__delitem__(col)
-
- # Reset the column index to conserve space
- df.columns = pandas.RangeIndex(0, len(df.columns))
- return df
-
- # This structure is used to get the correct index inside the partition.
- del_df = self._col_metadata[key]
-
- # We need to standardize between multiple and single occurrences in the
- # columns. Putting single occurrences in a pandas.DataFrame and
- # transposing results in the same structure as multiple with 'loc'.
- if isinstance(del_df, pandas.Series):
- del_df = pandas.DataFrame(del_df).T
-
- # Cast cols as pandas.Series as duplicate columns mean result may be
- # np.int64 or pandas.Series
- col_parts_to_del = \
- pandas.Series(del_df['partition'].copy()).unique()
- self._col_metadata.drop(key)
-
- for i in col_parts_to_del:
- # Compute the correct index inside the partition to delete.
- to_delete_in_partition = \
- del_df[del_df['partition'] == i]['index_within_partition']
-
- for j in range(self._block_partitions.shape[0]):
- self._block_partitions[j, i] = _deploy_func.remote(
- del_helper, self._block_partitions[j, i],
- to_delete_in_partition)
-
- self._col_metadata.reset_partition_coords(col_parts_to_del)
-
- def __finalize__(self, other, method=None, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __copy__(self, deep=True):
- """Make a copy using Ray.DataFrame.copy method
-
- Args:
- deep: Boolean, deep copy or not.
- Currently we do not support deep copy.
-
- Returns:
- A Ray DataFrame object.
- """
- return self.copy(deep=deep)
-
- def __deepcopy__(self, memo=None):
- """Make a -deep- copy using Ray.DataFrame.copy method
- This is equivalent to copy(deep=True).
-
- Args:
- memo: No effect. Just to comply with Pandas API.
-
- Returns:
- A Ray DataFrame object.
- """
- return self.copy(deep=True)
-
- def __and__(self, other):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __or__(self, other):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __xor__(self, other):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __lt__(self, other):
- return self.lt(other)
-
- def __le__(self, other):
- return self.le(other)
-
- def __gt__(self, other):
- return self.gt(other)
-
- def __ge__(self, other):
- return self.ge(other)
-
- def __eq__(self, other):
- return self.eq(other)
-
- def __ne__(self, other):
- return self.ne(other)
-
- def __add__(self, other):
- return self.add(other)
-
- def __iadd__(self, other):
- return self.add(other)
-
- def __radd__(self, other, axis="columns", level=None, fill_value=None):
- return self.radd(other, axis, level, fill_value)
-
- def __mul__(self, other):
- return self.mul(other)
-
- def __imul__(self, other):
- return self.mul(other)
-
- def __rmul__(self, other, axis="columns", level=None, fill_value=None):
- return self.rmul(other, axis, level, fill_value)
-
- def __pow__(self, other):
- return self.pow(other)
-
- def __ipow__(self, other):
- return self.pow(other)
-
- def __rpow__(self, other, axis="columns", level=None, fill_value=None):
- return self.rpow(other, axis, level, fill_value)
-
- def __sub__(self, other):
- return self.sub(other)
-
- def __isub__(self, other):
- return self.sub(other)
-
- def __rsub__(self, other, axis="columns", level=None, fill_value=None):
- return self.rsub(other, axis, level, fill_value)
-
- def __floordiv__(self, other):
- return self.floordiv(other)
-
- def __ifloordiv__(self, other):
- return self.floordiv(other)
-
- def __rfloordiv__(self, other, axis="columns", level=None,
- fill_value=None):
- return self.rfloordiv(other, axis, level, fill_value)
-
- def __truediv__(self, other):
- return self.truediv(other)
-
- def __itruediv__(self, other):
- return self.truediv(other)
-
- def __rtruediv__(self, other, axis="columns", level=None, fill_value=None):
- return self.rtruediv(other, axis, level, fill_value)
-
- def __mod__(self, other):
- return self.mod(other)
-
- def __imod__(self, other):
- return self.mod(other)
-
- def __rmod__(self, other, axis="columns", level=None, fill_value=None):
- return self.rmod(other, axis, level, fill_value)
-
- def __div__(self, other, axis="columns", level=None, fill_value=None):
- return self.div(other, axis, level, fill_value)
-
- def __rdiv__(self, other, axis="columns", level=None, fill_value=None):
- return self.rdiv(other, axis, level, fill_value)
-
- def __neg__(self):
- """Computes an element wise negative DataFrame
-
- Returns:
- A modified DataFrame where every element is the negation of before
- """
- for t in self.dtypes:
- if not (is_bool_dtype(t)
- or is_numeric_dtype(t)
- or is_timedelta64_dtype(t)):
- raise TypeError("Unary negative expects numeric dtype, not {}"
- .format(t))
-
- new_block_partitions = np.array([_map_partitions(
- lambda df: df.__neg__(), block)
- for block in self._block_partitions])
-
- return DataFrame(block_partitions=new_block_partitions,
- col_metadata=self._col_metadata,
- row_metadata=self._row_metadata)
-
- def __sizeof__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @property
- def __doc__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @property
- def blocks(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @property
- def style(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def iat(self, axis=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @property
- def loc(self):
- """Purely label-location based indexer for selection by label.
-
- We currently support: single label, list array, slice object
- We do not support: boolean array, callable
- """
- from .indexing import _Loc_Indexer
- return _Loc_Indexer(self)
-
- @property
- def is_copy(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def at(self, axis=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def ix(self, axis=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @property
- def iloc(self):
- """Purely integer-location based indexing for selection by position.
-
- We currently support: single label, list array, slice object
- We do not support: boolean array, callable
- """
- from .indexing import _iLoc_Indexer
- return _iLoc_Indexer(self)
-
- def _copartition(self, other, new_index):
- """Colocates the values of other with this for certain operations.
-
- NOTE: This method uses the indexes of each DataFrame to order them the
- same. This operation does an implicit shuffling of data and zips
- the two DataFrames together to be operated on.
-
- Args:
- other: The other DataFrame to copartition with.
-
- Returns:
- Two new sets of partitions, copartitioned and zipped.
- """
- # Put in the object store so they aren't serialized each iteration.
- old_self_index = ray.put(self.index)
- new_index = ray.put(new_index)
- old_other_index = ray.put(other.index)
-
- new_num_partitions = max(len(self._block_partitions.T),
- len(other._block_partitions.T))
-
- new_partitions_self = \
- np.array([_reindex_helper._submit(
- args=tuple([old_self_index, new_index, 1,
- new_num_partitions] + block.tolist()),
- num_return_vals=new_num_partitions)
- for block in self._block_partitions.T]).T
-
- new_partitions_other = \
- np.array([_reindex_helper._submit(
- args=tuple([old_other_index, new_index, 1,
- new_num_partitions] + block.tolist()),
- num_return_vals=new_num_partitions)
- for block in other._block_partitions.T]).T
-
- return zip(new_partitions_self, new_partitions_other)
-
- def _operator_helper(self, func, other, axis, level, *args):
- """Helper method for inter-dataframe and scalar operations"""
- if isinstance(other, DataFrame):
- return self._inter_df_op_helper(
- lambda x, y: func(x, y, axis, level, *args),
- other, "outer", axis, level)
- else:
- return self._single_df_op_helper(
- lambda df: func(df, other, axis, level, *args),
- other, axis, level)
-
- def _inter_df_op_helper(self, func, other, how, axis, level,
- inplace=False):
- if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- new_column_index = self.columns.join(other.columns, how=how)
- new_index = self.index.join(other.index, how=how)
- copartitions = self._copartition(other, new_index)
-
- new_blocks = \
- np.array([_co_op_helper._submit(
- args=tuple([func, self.columns, other.columns,
- len(part[0]), None] +
- np.concatenate(part).tolist()),
- num_return_vals=len(part[0]))
- for part in copartitions])
-
- if not inplace:
- # TODO join the Index Metadata objects together for performance.
- return DataFrame(block_partitions=new_blocks,
- columns=new_column_index,
- index=new_index)
- else:
- self._update_inplace(block_partitions=new_blocks,
- columns=new_column_index,
- index=new_index)
-
- def _single_df_op_helper(self, func, other, axis, level):
- if level is not None:
- raise NotImplementedError("Multilevel index not yet supported "
- "in Pandas on Ray")
- axis = pandas.DataFrame()._get_axis_number(axis)
-
- if is_list_like(other):
- new_index = self.index
- new_column_index = self.columns
- new_col_metadata = self._col_metadata
- new_row_metadata = self._row_metadata
- new_blocks = None
-
- if axis == 0:
- if len(other) != len(self.index):
- raise ValueError(
- "Unable to coerce to Series, length must be {0}: "
- "given {1}".format(len(self.index), len(other)))
- new_columns = _map_partitions(func, self._col_partitions)
- new_rows = None
- else:
- if len(other) != len(self.columns):
- raise ValueError(
- "Unable to coerce to Series, length must be {0}: "
- "given {1}".format(len(self.columns), len(other)))
- new_rows = _map_partitions(func, self._row_partitions)
- new_columns = None
-
- else:
- new_blocks = np.array([_map_partitions(func, block)
- for block in self._block_partitions])
- new_columns = None
- new_rows = None
- new_index = self.index
- new_column_index = self.columns
- new_col_metadata = self._col_metadata
- new_row_metadata = self._row_metadata
-
- return DataFrame(col_partitions=new_columns,
- row_partitions=new_rows,
- block_partitions=new_blocks,
- index=new_index,
- columns=new_column_index,
- col_metadata=new_col_metadata,
- row_metadata=new_row_metadata)
-
-
-@ray.remote
-def _merge_columns(left_columns, right_columns, *args):
- """Merge two columns to get the correct column names and order.
-
- Args:
- left_columns: The columns on the left side of the merge.
- right_columns: The columns on the right side of the merge.
- args: The arguments for the merge.
-
- Returns:
- The columns for the merge operation.
- """
- return pandas.DataFrame(columns=left_columns, index=[0], dtype='uint8') \
- .merge(pandas.DataFrame(columns=right_columns, index=[0],
- dtype='uint8'), *args).columns
-
-
-@ray.remote
-def _where_helper(left, cond, other, left_columns, cond_columns,
- other_columns, *args):
-
- left = pandas.concat(ray.get(left.tolist()), axis=1, copy=False)
- # We have to reset the index and columns here because we are coming
- # from blocks and the axes are set according to the blocks. We have
- # already correctly copartitioned everything, so there's no
- # correctness problems with doing this.
- left.reset_index(inplace=True, drop=True)
- left.columns = left_columns
-
- cond = pandas.concat(ray.get(cond.tolist()), axis=1, copy=False)
- cond.reset_index(inplace=True, drop=True)
- cond.columns = cond_columns
-
- if isinstance(other, np.ndarray):
- other = pandas.concat(ray.get(other.tolist()), axis=1, copy=False)
- other.reset_index(inplace=True, drop=True)
- other.columns = other_columns
-
- return left.where(cond, other, *args)
-
-
-@ray.remote
-def reindex_helper(old_index, new_index, axis, npartitions, method, fill_value,
- limit, tolerance, *df):
- df = pandas.concat(df, axis=axis ^ 1, copy=False)
- if axis == 1:
- df.index = old_index
- else:
- df.columns = old_index
-
- df = df.reindex(new_index, copy=False, axis=axis ^ 1,
- method=method, fill_value=fill_value,
- limit=limit, tolerance=tolerance)
- return create_blocks_helper(df, npartitions, axis)
-
-
-@ray.remote
-def _equals_helper(left, right):
- right = pandas.concat(ray.get(right.tolist()), axis=1, copy=False)
- left = pandas.concat(ray.get(left.tolist()), axis=1, copy=False)
- # Since we know that the index and columns match, we can just check the
- # values. We can't use np.array_equal here because it doesn't recognize
- # np.nan as equal to another np.nan
- try:
- assert_equal(left.values, right.values)
- except AssertionError:
- return False
- return True
diff --git a/python/ray/dataframe/datetimes.py b/python/ray/dataframe/datetimes.py
deleted file mode 100644
index 48a437ceb..000000000
--- a/python/ray/dataframe/datetimes.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pandas
-import ray
-
-from .dataframe import DataFrame
-from .utils import _map_partitions
-
-
-def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None,
- box=True, format=None, exact=True, unit=None,
- infer_datetime_format=False, origin='unix'):
- """Convert the arg to datetime format. If not Ray DataFrame, this falls
- back on pandas.
-
- Args:
- errors ('raise' or 'ignore'): If 'ignore', errors are silenced.
- dayfirst (bool): Date format is passed in as day first.
- yearfirst (bool): Date format is passed in as year first.
- utc (bool): retuns a UTC DatetimeIndex if True.
- box (bool): If True, returns a DatetimeIndex.
- format (string): strftime to parse time, eg "%d/%m/%Y".
- exact (bool): If True, require an exact format match.
- unit (string, default 'ns'): unit of the arg.
- infer_datetime_format (bool): Whether or not to infer the format.
- origin (string): Define the reference date.
-
- Returns:
- Type depends on input:
-
- - list-like: DatetimeIndex
- - Series: Series of datetime64 dtype
- - scalar: Timestamp
- """
- if not isinstance(arg, DataFrame):
- return pandas.to_datetime(arg, errors=errors, dayfirst=dayfirst,
- yearfirst=yearfirst, utc=utc, box=box,
- format=format, exact=exact, unit=unit,
- infer_datetime_format=infer_datetime_format,
- origin=origin)
- if errors == 'raise':
- pandas.to_datetime(pandas.DataFrame(columns=arg.columns),
- errors=errors, dayfirst=dayfirst,
- yearfirst=yearfirst, utc=utc, box=box,
- format=format, exact=exact, unit=unit,
- infer_datetime_format=infer_datetime_format,
- origin=origin)
-
- def datetime_helper(df, cols):
- df.columns = cols
- return pandas.to_datetime(df, errors=errors, dayfirst=dayfirst,
- yearfirst=yearfirst, utc=utc, box=box,
- format=format, exact=exact, unit=unit,
- infer_datetime_format=infer_datetime_format,
- origin=origin)
-
- datetime_series = _map_partitions(datetime_helper, arg._row_partitions,
- arg.columns)
- result = pandas.concat(ray.get(datetime_series), copy=False)
- result.index = arg.index
-
- return result
diff --git a/python/ray/dataframe/groupby.py b/python/ray/dataframe/groupby.py
deleted file mode 100644
index 39862d94d..000000000
--- a/python/ray/dataframe/groupby.py
+++ /dev/null
@@ -1,581 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pandas
-import numpy as np
-import pandas.core.groupby
-from pandas.core.dtypes.common import is_list_like
-import pandas.core.common as com
-
-import ray
-
-from .utils import _inherit_docstrings, _reindex_helper
-from .concat import concat
-from .index_metadata import _IndexMetadata
-
-
-@_inherit_docstrings(pandas.core.groupby.DataFrameGroupBy,
- excluded=[pandas.core.groupby.DataFrameGroupBy,
- pandas.core.groupby.DataFrameGroupBy.__init__])
-class DataFrameGroupBy(object):
-
- def __init__(self, df, by, axis, level, as_index, sort, group_keys,
- squeeze, **kwargs):
-
- self._columns = df.columns
- self._index = df.index
- self._axis = axis
-
- self._row_metadata = df._row_metadata
- self._col_metadata = df._col_metadata
-
- if axis == 0:
- partitions = [column for column in df._block_partitions.T]
- self._index_grouped = \
- pandas.Series(self._index, index=self._index) \
- .groupby(by=by, sort=sort)
- else:
- partitions = [row for row in df._block_partitions]
- self._index_grouped = \
- pandas.Series(self._columns, index=self._columns) \
- .groupby(by=by, sort=sort)
-
- self._keys_and_values = [(k, v)
- for k, v in self._index_grouped]
-
- if len(self) > 1:
- self._grouped_partitions = \
- list(zip(*(groupby._submit(args=(by,
- axis,
- level,
- as_index,
- sort,
- group_keys,
- squeeze)
- + tuple(part.tolist()),
- num_return_vals=len(self))
- for part in partitions)))
- else:
- if axis == 0:
- self._grouped_partitions = [df._col_partitions]
- else:
- self._grouped_partitions = [df._row_partitions]
-
- def __getattr__(self, key):
- """Afer regular attribute access, looks up the name in the columns
-
- Args:
- key (str): Attribute name.
-
- Returns:
- The value of the attribute.
- """
- try:
- return object.__getattribute__(self, key)
- except AttributeError as e:
- if key in self._columns:
- raise NotImplementedError(
- "SeriesGroupBy is not implemented."
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
- raise e
-
- @property
- def _iter(self):
- from .dataframe import DataFrame
-
- if self._axis == 0:
- return [(self._keys_and_values[i][0],
- DataFrame(col_partitions=part,
- columns=self._columns,
- index=self._keys_and_values[i][1].index,
- col_metadata=self._col_metadata))
- for i, part in enumerate(self._grouped_partitions)]
- else:
- return [(self._keys_and_values[i][0],
- DataFrame(row_partitions=part,
- columns=self._keys_and_values[i][1].index,
- index=self._index,
- row_metadata=self._row_metadata))
- for i, part in enumerate(self._grouped_partitions)]
-
- @property
- def ngroups(self):
- return len(self)
-
- def skew(self, **kwargs):
- return self._apply_agg_function(lambda df: df.skew(axis=self._axis,
- **kwargs))
-
- def ffill(self, limit=None):
- return self._apply_df_function(lambda df: df.ffill(axis=self._axis,
- limit=limit))
-
- def sem(self, ddof=1):
- return self._apply_agg_function(lambda df: df.sem(axis=self._axis,
- ddof=ddof))
-
- def mean(self, *args, **kwargs):
- return self._apply_agg_function(lambda df: df.mean(axis=self._axis,
- *args,
- **kwargs))
-
- def any(self):
- return self._apply_agg_function(lambda df: df.any(axis=self._axis))
-
- @property
- def plot(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def ohlc(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __bytes__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @property
- def tshift(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- @property
- def groups(self):
- return {k: pandas.Index(v) for k, v in self._keys_and_values}
-
- def min(self, **kwargs):
- return self._apply_agg_function(lambda df: df.min(axis=self._axis,
- **kwargs))
-
- def idxmax(self):
- def idxmax_helper(df, index):
- result = df.idxmax(axis=self._axis)
- result = result.apply(lambda v: index[v])
- return result
-
- results = [idxmax_helper(g[1], i[1])
- for g, i in zip(self._iter, self._index_grouped)]
-
- new_df = concat(results, axis=1)
- if self._axis == 0:
- new_df = new_df.T
- new_df.columns = self._columns
- new_df.index = [k for k, v in self._iter]
- else:
- new_df.columns = [k for k, v in self._iter]
- new_df.index = self._index
- return new_df
-
- @property
- def ndim(self):
- return 2 # ndim is always 2 for DataFrames
-
- def shift(self, periods=1, freq=None, axis=0):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def nth(self, n, dropna=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def cumsum(self, axis=0, *args, **kwargs):
- return self._apply_df_function(lambda df: df.cumsum(axis,
- *args,
- **kwargs))
-
- @property
- def indices(self):
- return dict(self._keys_and_values)
-
- def pct_change(self):
- return self._apply_agg_function(
- lambda df: df.pct_change(axis=self._axis))
-
- def filter(self, func, dropna=True, *args, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def cummax(self, axis=0, **kwargs):
- return self._apply_df_function(lambda df: df.cummax(axis,
- **kwargs))
-
- def apply(self, func, *args, **kwargs):
- def apply_helper(df):
- return df.apply(func, axis=self._axis, *args, **kwargs)
-
- result = [func(v) for k, v in self._iter]
- if self._axis == 0:
- if isinstance(result[0], pandas.Series):
- # Applied an aggregation function
- new_df = concat(result, axis=1).T
- new_df.columns = self._columns
- new_df.index = [k for k, v in self._iter]
- else:
- new_df = concat(result, axis=self._axis)
- new_df._block_partitions = np.array([_reindex_helper._submit(
- args=tuple([new_df.index, self._index, self._axis ^ 1,
- len(new_df._block_partitions)]
- + block.tolist()),
- num_return_vals=len(new_df._block_partitions))
- for block in new_df._block_partitions.T]).T
- new_df.index = self._index
- new_df._row_metadata = \
- _IndexMetadata(new_df._block_partitions[:, 0],
- index=new_df.index, axis=0)
- else:
- if isinstance(result[0], pandas.Series):
- # Applied an aggregation function
- new_df = concat(result, axis=1)
- new_df.columns = [k for k, v in self._iter]
- new_df.index = self._index
- else:
- new_df = concat(result, axis=self._axis)
- new_df._block_partitions = np.array([_reindex_helper._submit(
- args=tuple([new_df.columns, self._columns, self._axis ^ 1,
- new_df._block_partitions.shape[1]]
- + block.tolist()),
- num_return_vals=new_df._block_partitions.shape[1])
- for block in new_df._block_partitions])
- new_df.columns = self._columns
- new_df._col_metadata = \
- _IndexMetadata(new_df._block_partitions[0, :],
- index=new_df.columns, axis=1)
- return new_df
-
- @property
- def dtypes(self):
- if self._axis == 1:
- raise ValueError("Cannot call dtypes on groupby with axis=1")
- return self._apply_agg_function(lambda df: df.dtypes)
-
- def first(self, **kwargs):
- return self._apply_agg_function(lambda df: df.first(offset=0,
- **kwargs))
-
- def backfill(self, limit=None):
- return self.bfill(limit)
-
- def __getitem__(self, key):
- # This operation requires a SeriesGroupBy Object
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def cummin(self, axis=0, **kwargs):
- return self._apply_df_function(lambda df: df.cummin(axis=axis,
- **kwargs))
-
- def bfill(self, limit=None):
- return self._apply_df_function(lambda df: df.bfill(axis=self._axis,
- limit=limit))
-
- def idxmin(self):
- def idxmin_helper(df, index):
- result = df.idxmin(axis=self._axis)
- result = result.apply(lambda v: index[v])
- return result
-
- results = [idxmin_helper(g[1], i[1])
- for g, i in zip(self._iter, self._index_grouped)]
-
- new_df = concat(results, axis=1)
- if self._axis == 0:
- new_df = new_df.T
- new_df.columns = self._columns
- new_df.index = [k for k, v in self._iter]
- else:
- new_df.columns = [k for k, v in self._iter]
- new_df.index = self._index
- return new_df
-
- def prod(self, **kwargs):
- return self._apply_agg_function(lambda df: df.prod(axis=self._axis,
- **kwargs))
-
- def std(self, ddof=1, *args, **kwargs):
- return self._apply_agg_function(lambda df: df.std(axis=self._axis,
- ddof=ddof,
- *args,
- **kwargs))
-
- def aggregate(self, arg, *args, **kwargs):
- if self._axis != 0:
- # This is not implemented in pandas,
- # so we throw a different message
- raise NotImplementedError("axis other than 0 is not supported")
-
- if is_list_like(arg):
- raise NotImplementedError(
- "This requires Multi-level index to be implemented. "
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
- return self._apply_agg_function(lambda df: df.agg(arg,
- axis=self._axis,
- *args,
- **kwargs))
-
- def last(self, **kwargs):
- return self._apply_df_function(lambda df: df.last(offset=0,
- **kwargs))
-
- def mad(self):
- return self._apply_agg_function(lambda df: df.mad())
-
- def rank(self):
- return self._apply_df_function(lambda df: df.rank(axis=self._axis))
-
- @property
- def corrwith(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def pad(self, limit=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def max(self, **kwargs):
- return self._apply_agg_function(lambda df: df.max(axis=self._axis,
- **kwargs))
-
- def var(self, ddof=1, *args, **kwargs):
- return self._apply_agg_function(lambda df: df.var(ddof=ddof,
- axis=self._axis,
- *args,
- **kwargs))
-
- def get_group(self, name, obj=None):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def __len__(self):
- return len(self._keys_and_values)
-
- def all(self):
- return self._apply_agg_function(lambda df: df.all())
-
- def size(self):
- return self._apply_agg_function(lambda df: df.size)
-
- def sum(self, **kwargs):
- return self._apply_agg_function(lambda df:
- df.sum(axis=self._axis, **kwargs))
-
- def __unicode__(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def describe(self, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def boxplot(self, grouped, subplots=True, column=None, fontsize=None,
- rot=0, grid=True, ax=None, figsize=None, layout=None, **kwds):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def ngroup(self, ascending=True):
- return self._index_grouped.ngroup(ascending)
-
- def nunique(self, dropna=True):
- return self._apply_agg_function(lambda df: df.nunique(dropna=dropna,
- axis=self._axis))
-
- def resample(self, rule, *args, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def median(self, **kwargs):
- return self._apply_agg_function(lambda df: df.median(axis=self._axis,
- **kwargs))
-
- def head(self, n=5):
- result = [v.head(n) for k, v in self._iter]
- new_df = concat(result, axis=self._axis)
-
- if self._axis == 0:
- index_head = [v[:n] for k, v in self._keys_and_values]
- flattened_index = {i for j in index_head for i in j}
- sorted_index = [i for i in self._index if i in flattened_index]
- new_df._block_partitions = np.array([_reindex_helper._submit(
- args=tuple([new_df.index, sorted_index, 1,
- len(new_df._block_partitions)] + block.tolist()),
- num_return_vals=len(new_df._block_partitions))
- for block in new_df._block_partitions.T]).T
- new_df.index = sorted_index
- new_df._row_metadata = \
- _IndexMetadata(new_df._block_partitions[:, 0],
- index=new_df.index, axis=0)
-
- return new_df
-
- def cumprod(self, axis=0, *args, **kwargs):
- return self._apply_df_function(lambda df: df.cumprod(axis,
- *args,
- **kwargs))
-
- def __iter__(self):
- return self._iter.__iter__()
-
- def agg(self, arg, *args, **kwargs):
- return self.aggregate(arg, *args, **kwargs)
-
- def cov(self):
- return self._apply_agg_function(lambda df: df.cov())
-
- def transform(self, func, *args, **kwargs):
- return self._apply_df_function(lambda df: df.transform(func,
- *args,
- **kwargs))
-
- def corr(self, **kwargs):
- return self._apply_agg_function(lambda df: df.corr(**kwargs))
-
- def fillna(self, **kwargs):
- return self._apply_df_function(lambda df: df.fillna(axis=self._axis,
- **kwargs))
-
- def count(self, **kwargs):
- return self._apply_agg_function(lambda df: df.count(self._axis,
- **kwargs))
-
- def pipe(self, func, *args, **kwargs):
- return com._pipe(self, func, *args, **kwargs)
-
- def cumcount(self, ascending=True):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def tail(self, n=5):
- result = [v.tail(n) for k, v in self._iter]
- new_df = concat(result, axis=self._axis)
-
- if self._axis == 0:
- index_tail = [v[-n:] for k, v in self._keys_and_values]
- flattened_index = {i for j in index_tail for i in j}
- sorted_index = [i for i in self._index if i in flattened_index]
- new_df._block_partitions = np.array([_reindex_helper._submit(
- args=tuple([new_df.index, sorted_index, 1,
- len(new_df._block_partitions)] + block.tolist()),
- num_return_vals=len(new_df._block_partitions))
- for block in new_df._block_partitions.T]).T
- new_df.index = sorted_index
- new_df._row_metadata = \
- _IndexMetadata(new_df._block_partitions[:, 0],
- index=new_df.index, axis=0)
-
- return new_df
-
- # expanding and rolling are unique cases and need to likely be handled
- # separately. They do not appear to be commonly used.
- def expanding(self, *args, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def rolling(self, *args, **kwargs):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def hist(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def quantile(self, q=0.5, **kwargs):
- if is_list_like(q):
- raise NotImplementedError(
- "This requires Multi-level index to be implemented. "
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- return self._apply_agg_function(lambda df: df.quantile(q=q,
- axis=self._axis,
- **kwargs))
-
- def diff(self):
- raise NotImplementedError(
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- def take(self, **kwargs):
- return self._apply_df_function(lambda df: df.take(**kwargs))
-
- def _apply_agg_function(self, f):
- assert callable(f), "\'{0}\' object is not callable".format(type(f))
-
- result = [f(v) for k, v in self._iter]
- new_df = concat(result, axis=1)
-
- if self._axis == 0:
- new_df = new_df.T
- new_df.columns = self._columns
- new_df.index = [k for k, v in self._iter]
- else:
- new_df.columns = [k for k, v in self._iter]
- new_df.index = self._index
- return new_df
-
- def _apply_df_function(self, f, concat_axis=None):
- assert callable(f), "\'{0}\' object is not callable".format(type(f))
-
- result = [f(v) for k, v in self._iter]
- concat_axis = self._axis if concat_axis is None else concat_axis
-
- new_df = concat(result, axis=concat_axis)
-
- if self._axis == 0:
- new_df._block_partitions = np.array([_reindex_helper._submit(
- args=tuple([new_df.index, self._index, 1,
- len(new_df._block_partitions)] + block.tolist()),
- num_return_vals=len(new_df._block_partitions))
- for block in new_df._block_partitions.T]).T
- new_df.index = self._index
- new_df._row_metadata = \
- _IndexMetadata(new_df._block_partitions[:, 0],
- index=new_df.index, axis=0)
- else:
- new_df._block_partitions = np.array([_reindex_helper._submit(
- args=tuple([new_df.columns, self._columns, 0,
- new_df._block_partitions.shape[1]]
- + block.tolist()),
- num_return_vals=new_df._block_partitions.shape[1])
- for block in new_df._block_partitions])
- new_df.columns = self._columns
- new_df._col_metadata = \
- _IndexMetadata(new_df._block_partitions[0, :],
- index=new_df.columns, axis=1)
-
- return new_df
-
-
-@ray.remote
-def groupby(by, axis, level, as_index, sort, group_keys, squeeze, *df):
-
- df = pandas.concat(df, axis=axis)
-
- return [v for k, v in df.groupby(by=by,
- axis=axis,
- level=level,
- as_index=as_index,
- sort=sort,
- group_keys=group_keys,
- squeeze=squeeze)]
diff --git a/python/ray/dataframe/index_metadata.py b/python/ray/dataframe/index_metadata.py
deleted file mode 100644
index 62bea3f0e..000000000
--- a/python/ray/dataframe/index_metadata.py
+++ /dev/null
@@ -1,415 +0,0 @@
-import pandas
-import numpy as np
-import ray
-
-from .utils import (
- _build_row_lengths,
- _build_col_widths,
- _build_coord_df)
-
-from pandas.core.indexing import convert_to_index_sliceable
-
-
-class _IndexMetadata(object):
- """Wrapper for Pandas indexes in Ray DataFrames. Handles all of the
- metadata specific to the axis of partition (setting indexes,
- calculating the index within partition of a value, etc.). This
- implementation assumes the underlying index lies across multiple
- partitions.
-
- IMPORTANT NOTE: Currently all operations, as implemented, are inplace.
-
- WARNING: Currently, the `_lengths` item is the source of truth for an
- _IndexMetadata object, since it is easy to manage, and that the coord_df
- item may be deprecated in the future. As such, it is _very_ important that
- any functions that mutate the coord_df splits in anyway first modify the
- lengths. Otherwise bad things might happen!
- """
-
- def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None,
- coord_df_oid=None):
- """Inits a IndexMetadata from Ray DataFrame partitions
-
- Args:
- dfs ([ObjectID]): ObjectIDs of dataframe partitions
- index (pandas.Index): Index of the Ray DataFrame.
- axis: Axis of partition (0=row partitions, 1=column partitions)
-
- Returns:
- A IndexMetadata backed by the specified pandas.Index, partitioned
- off specified partitions
- """
- assert (lengths_oid is None) == (coord_df_oid is None), \
- "Must pass both or neither of lengths_oid and coord_df_oid"
-
- if dfs is not None and lengths_oid is None:
- if axis == 0:
- lengths_oid = _build_row_lengths.remote(dfs)
- else:
- lengths_oid = _build_col_widths.remote(dfs)
- coord_df_oid = _build_coord_df.remote(lengths_oid, index)
-
- self._lengths = lengths_oid
- self._coord_df = coord_df_oid
- self._index_cache = index
- self._cached_index = False
-
- def _get__lengths(self):
- if isinstance(self._lengths_cache, ray.ObjectID) or \
- (isinstance(self._lengths_cache, list) and
- isinstance(self._lengths_cache[0], ray.ObjectID)):
- self._lengths_cache = ray.get(self._lengths_cache)
- return self._lengths_cache
-
- def _set__lengths(self, lengths):
- self._lengths_cache = lengths
-
- _lengths = property(_get__lengths, _set__lengths)
-
- def _get__coord_df(self):
- """Get the coordinate dataframe wrapped by this _IndexMetadata.
-
- Since we may have had an index set before our coord_df was
- materialized, we'll have to apply it to the newly materialized df
- """
- if isinstance(self._coord_df_cache, ray.ObjectID):
- self._coord_df_cache = ray.get(self._coord_df_cache)
- if self._cached_index:
- self._coord_df_cache.index = self._index_cache
- self._cached_index = False
- return self._coord_df_cache
-
- def _set__coord_df(self, coord_df):
- """Set the coordinate dataframe wrapped by this _IndexMetadata.
-
- Sometimes we set the _IndexMetadata's coord_df outside of the
- constructor, generally using fxns like drop(). This produces a modified
- index, so we need to reflect the change on the index cache.
-
- If the set _IndexMetadata is an OID instead (due to a copy or whatever
- reason), we fall back relying on `_index_cache`.
- """
- if not isinstance(coord_df, ray.ObjectID):
- self._index_cache = coord_df.index
- self._coord_df_cache = coord_df
-
- _coord_df = property(_get__coord_df, _set__coord_df)
-
- def _get_index(self):
- """Get the index wrapped by this _IndexMetadata.
-
- The only time `self._index_cache` would be None is in a newly created
- _IndexMetadata object without a specified `index` parameter (See the
- _IndexMetadata constructor for more details)
- """
- if isinstance(self._coord_df_cache, ray.ObjectID):
- return self._index_cache
- else:
- return self._coord_df_cache.index
-
- def _set_index(self, new_index):
- """Set the index wrapped by this _IndexMetadata.
-
- It is important to always set `_index_cache` even if the coord_df is
- materialized due to the possibility that it is set to an OID later on.
- This design is more straightforward than caching indexes on setting the
- coord_df to an OID due to the possibility of an OID-to-OID change.
- """
- new_index = pandas.DataFrame(index=new_index).index
- assert len(new_index) == len(self)
-
- self._index_cache = new_index
- if isinstance(self._coord_df_cache, ray.ObjectID):
- self._cached_index = True
- else:
- self._coord_df_cache.index = new_index
-
- index = property(_get_index, _set_index)
-
- def _get_index_cache(self):
- """Get the cached Index object, which may sometimes be an OID.
-
- This will ray.get the Index object out of the Ray store lazily, such
- that it is not grabbed until it is needed in the driver. This layer of
- abstraction is important for allowing this object to be instantiated
- with a remote Index object.
-
- Returns:
- The Index object in _index_cache.
- """
- if self._index_cache_validator is None:
- self._index_cache_validator = pandas.RangeIndex(len(self))
- elif isinstance(self._index_cache_validator,
- ray.ObjectID):
- self._index_cache_validator = ray.get(self._index_cache_validator)
-
- return self._index_cache_validator
-
- def _set_index_cache(self, new_index):
- """Sets the new index cache.
-
- Args:
- new_index: The Index to set the _index_cache to.
- """
- self._index_cache_validator = new_index
-
- # _index_cache_validator is an extra layer of abstraction to allow the
- # cache to accept ObjectIDs and ray.get them when needed.
- _index_cache = property(_get_index_cache, _set_index_cache)
-
- def coords_of(self, key):
- """Returns the coordinates (partition, index_within_partition) of the
- provided key in the index. Can be called on its own or implicitly
- through __getitem__
-
- Args:
- key:
- item to get coordinates of. Can also be a tuple of item
- and {"partition", "index_within_partition"} if caller only
- needs one of the coordinates
-
- Returns:
- Pandas object with the keys specified. If key is a single object
- it will be a pandas.Series with items `partition` and
- `index_within_partition`, and if key is a slice or if the key is
- duplicate it will be a pandas.DataFrame with said items as columns.
- """
- return self._coord_df.loc[key]
-
- def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
- group_keys=True, squeeze=False, **kwargs):
- # TODO: Find out what this does, and write a docstring
- assignments_df = self._coord_df.groupby(by=by, axis=axis, level=level,
- as_index=as_index, sort=sort,
- group_keys=group_keys,
- squeeze=squeeze, **kwargs)\
- .apply(lambda x: x[:])
- return assignments_df
-
- def partition_series(self, partition):
- return self[self._coord_df['partition'] == partition,
- 'index_within_partition']
-
- def __len__(self):
- return int(sum(self._lengths))
-
- def reset_partition_coords(self, partitions=None):
- partitions = np.array(partitions)
-
- for partition in partitions:
- partition_mask = (self._coord_df['partition'] == partition)
- # Since we are replacing columns with RangeIndex inside the
- # partition, we have to make sure that our reference to it is
- # upandasated as well.
- try:
- self._coord_df.loc[partition_mask,
- 'index_within_partition'] = np.arange(
- sum(partition_mask)).astype(int)
- except ValueError:
- # Copy the arrow sealed dataframe so we can mutate it.
- # We only do this the first time we try to mutate the sealed.
- self._coord_df = self._coord_df.copy()
- self._coord_df.loc[partition_mask,
- 'index_within_partition'] = np.arange(
- sum(partition_mask)).astype(int)
-
- def insert(self, key, loc=None, partition=None,
- index_within_partition=None):
- """Inserts a key at a certain location in the index, or a certain coord
- in a partition. Called with either `loc` or `partition` and
- `index_within_partition`. If called with both, `loc` will be used.
-
- Args:
- key: item to insert into index
- loc: location to insert into index
- partition: partition to insert into
- index_within_partition: index within partition to insert into
-
- Returns:
- DataFrame with coordinates of insert
- """
- # Perform insert on a specific partition
- # Determine which partition to place it in, and where in that partition
- if loc is not None:
- cum_lens = np.cumsum(self._lengths)
- if len(cum_lens) > 1:
- partition = np.digitize(loc, cum_lens[:-1], right=True)
- else:
- partition = 0
- if partition >= len(cum_lens):
- if loc > cum_lens[-1]:
- raise IndexError("index {0} is out of bounds".format(loc))
- else:
- index_within_partition = self._lengths[-1]
- else:
- first_in_partition = \
- np.asscalar(np.concatenate(([0], cum_lens))[partition])
- index_within_partition = loc - first_in_partition
-
- # TODO: Stop-gap solution until we begin passing IndexMetadatas
- return partition, index_within_partition
-
- # Generate new index
- new_index = self.index.insert(loc, key)
-
- # Shift indices in partition where we inserted column
- idx_locs = (self._coord_df.partition == partition) & \
- (self._coord_df.index_within_partition ==
- index_within_partition)
- # TODO: Determine why self._coord_df{,_cache} are read-only
- _coord_df_copy = self._coord_df.copy()
- _coord_df_copy.loc[idx_locs, 'index_within_partition'] += 1
-
- # TODO: Determine if there's a better way to do a row-index insert in
- # pandas, because this is very annoying/unsure of efficiency
- # Create new coord entry to insert
- coord_to_insert = pandas.DataFrame(
- {'partition': partition,
- 'index_within_partition': index_within_partition},
- index=[key])
-
- # Insert into cached RangeIndex, and order by new column index
- self._coord_df = _coord_df_copy.append(coord_to_insert).loc[new_index]
-
- # Return inserted coordinate for callee
- return coord_to_insert
-
- def get_global_indices(self, partition, index_within_partition_list):
- total = 0
- for i in range(partition):
- total += self._lengths[i]
-
- return [total + i for i in index_within_partition_list]
-
- def squeeze(self, partition, index_within_partition):
- """Prepare a single coordinate for removal by "squeezing" the
- subsequent coordinates "up" one index within that partition. To be used
- with "_IndexMetadata.drop" for when all the "squeezed" coordinates are
- dropped in batch. Note that this function doesn't actually mutate the
- coord_df.
- """
- self._coord_df = self._coord_df.copy()
-
- partition_mask = self._coord_df.partition == partition
- index_within_partition_mask = \
- self._coord_df.index_within_partition > index_within_partition
- self._coord_df.loc[partition_mask & index_within_partition_mask,
- 'index_within_partition'] -= 1
-
- def copy(self):
- # TODO: Investigate copy-on-write wrapper for metadata objects
- coord_df_copy = self._coord_df_cache
- if not isinstance(self._coord_df_cache, ray.ObjectID):
- coord_df_copy = self._coord_df_cache.copy()
-
- lengths_copy = self._lengths_cache
- if not isinstance(self._lengths_cache, ray.ObjectID):
- lengths_copy = self._lengths_cache.copy()
-
- index_copy = self._index_cache
- if self._index_cache is not None:
- index_copy = self._index_cache.copy()
-
- return _IndexMetadata(index=index_copy,
- coord_df_oid=coord_df_copy,
- lengths_oid=lengths_copy)
-
- def __getitem__(self, key):
- """Returns the coordinates (partition, index_within_partition) of the
- provided key in the index. Essentially just an alias for
- `_IndexMetadata.coords_of` that allows for slice passing, since
- slices cannot be passed with slice notation other than through
- `__getitem__` calls.
-
- Args:
- key:
- item to get coordinates of. Can also be a tuple of item
- and {"partition", "index_within_partition"} if caller only
- needs one of the coordinates
-
- Returns:
- Pandas object with the keys specified. If key is a single object
- it will be a pandas.Series with items `partition` and
- `index_within_partition`, and if key is a slice or if the key is
- duplicate it will be a pandas.DataFrame with said items as columns.
- """
- return self.coords_of(key)
-
- def first_valid_index(self):
- return self._coord_df.first_valid_index()
-
- def last_valid_index(self):
- return self._coord_df.last_valid_index()
-
- def drop(self, labels, errors='raise'):
- """Drop the specified labels from the IndexMetadata
-
- Args:
- labels (scalar or list-like):
- The labels to drop
- errors ('raise' or 'ignore'):
- If 'ignore', suppress errors for when labels don't exist
-
- Returns:
- DataFrame with coordinates of dropped labels
- """
- dropped = self.coords_of(labels)
-
- # Upandasate first lengths to prevent possible length inconsistencies
- if isinstance(dropped, pandas.DataFrame):
- try:
- drop_per_part = dropped.groupby(["partition"]).size()\
- .reindex(index=pandas.RangeIndex(len(self._lengths)),
- fill_value=0)
- except ValueError:
- # Copy the arrow sealed dataframe so we can mutate it.
- dropped = dropped.copy()
- drop_per_part = dropped.groupby(["partition"]).size()\
- .reindex(index=pandas.RangeIndex(len(self._lengths)),
- fill_value=0)
- elif isinstance(dropped, pandas.Series):
- drop_per_part = np.zeros_like(self._lengths)
- drop_per_part[dropped["partition"]] = 1
- else:
- raise AssertionError("Unrecognized result from `coords_of`")
-
- self._lengths = self._lengths - np.array(drop_per_part)
-
- new_coord_df = self._coord_df.drop(labels, errors=errors)
-
- num_dropped = 0
- for i, length in enumerate(self._lengths):
- if length == 0:
- num_dropped += 1
- if num_dropped > 0:
- new_coord_df['partition'][new_coord_df['partition'] == i] \
- -= num_dropped
-
- self._coord_df = new_coord_df
- return dropped
-
- def rename_index(self, mapper):
- """Rename the index.
-
- Args:
- mapper: name to rename the index as
- """
- self._coord_df = self._coord_df.rename_axis(mapper, axis=0)
-
- def convert_to_index_sliceable(self, key):
- """Converts and performs error checking on the passed slice
-
- Args:
- key: slice to convert and check
- """
- return convert_to_index_sliceable(self._coord_df, key)
-
- def get_partition(self, partition_id):
- """Return a view of coord_df where partition = partition_id
- """
- return self._coord_df[self._coord_df.partition == partition_id]
-
- def sorted_index(self):
- return (self._coord_df
- .sort_values(['partition', 'index_within_partition'])
- .index)
diff --git a/python/ray/dataframe/indexing.py b/python/ray/dataframe/indexing.py
deleted file mode 100644
index 2fa2e1e8d..000000000
--- a/python/ray/dataframe/indexing.py
+++ /dev/null
@@ -1,469 +0,0 @@
-"""Indexing Helper Class works as follows:
-
-_Location_Indexer_Base provide methods framework for __getitem__
- and __setitem__ that work with Ray DataFrame's internal index. Base
- class's __{get,set}item__ takes in partitions & idx_in_partition data
- and perform lookup/item write.
-
-_LocIndexer and _iLocIndexer is responsible for indexer specific logic and
- lookup computation. Loc will take care of enlarge dataframe. Both indexer
- will take care of translating pandas's lookup to Ray DataFrame's internal
- lookup.
-
-An illustration is available at
-https://github.com/ray-project/ray/pull/1955#issuecomment-386781826
-"""
-import pandas
-import numpy as np
-import ray
-from warnings import warn
-
-from pandas.api.types import (is_scalar, is_list_like, is_bool)
-from pandas.core.dtypes.common import is_integer
-from pandas.core.indexing import IndexingError
-
-from .utils import (_blocks_to_col, _get_nan_block_id, extractor,
- _mask_block_partitions, writer)
-from .index_metadata import _IndexMetadata
-from .dataframe import DataFrame
-
-
-def is_slice(x): return isinstance(x, slice)
-
-
-def is_2d(x): return is_list_like(x) or is_slice(x)
-
-
-def is_tuple(x): return isinstance(x, tuple)
-
-
-def is_boolean_array(x): return is_list_like(x) and all(map(is_bool, x))
-
-
-def is_integer_slice(x):
- if not is_slice(x):
- return False
- for pos in [x.start, x.stop, x.step]:
- if not ((pos is None) or is_integer(pos)):
- return False # one position is neither None nor int
- return True
-
-
-_ENLARGEMENT_WARNING = """
-Passing list-likes to .loc or [] with any missing label will raise
-KeyError in the future, you can use .reindex() as an alternative.
-
-See the documentation here:
-http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
-"""
-
-_ILOC_INT_ONLY_ERROR = """
-Location based indexing can only have [integer, integer slice (START point is
-INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types.
-"""
-
-
-def _parse_tuple(tup):
- """Unpack the user input for getitem and setitem and compute ndim
-
- loc[a] -> ([a], :), 1D
- loc[[a,b],] -> ([a,b], :),
- loc[a,b] -> ([a], [b]), 0D
- """
- row_loc, col_loc = slice(None), slice(None)
-
- if is_tuple(tup):
- row_loc = tup[0]
- if len(tup) == 2:
- col_loc = tup[1]
- if len(tup) > 2:
- raise IndexingError('Too many indexers')
- else:
- row_loc = tup
-
- ndim = _compute_ndim(row_loc, col_loc)
- row_loc = [row_loc] if is_scalar(row_loc) else row_loc
- col_loc = [col_loc] if is_scalar(col_loc) else col_loc
-
- return row_loc, col_loc, ndim
-
-
-def _is_enlargement(locator, coord_df):
- """Determine if a locator will enlarge the corrd_df.
-
- Enlargement happens when you trying to locate using labels isn't in the
- original index. In other words, enlargement == adding NaNs !
- """
- if is_list_like(locator) and not is_slice(
- locator) and len(locator) > 0 and not is_boolean_array(locator):
- n_diff_elems = len(pandas.Index(locator).difference(coord_df.index))
- is_enlargement_boolean = n_diff_elems > 0
- return is_enlargement_boolean
- return False
-
-
-def _warn_enlargement():
- warn(FutureWarning(_ENLARGEMENT_WARNING))
-
-
-def _compute_ndim(row_loc, col_loc):
- """Compute the ndim of result from locators
- """
- row_scaler = is_scalar(row_loc)
- col_scaler = is_scalar(col_loc)
-
- if row_scaler and col_scaler:
- ndim = 0
- elif row_scaler ^ col_scaler:
- ndim = 1
- else:
- ndim = 2
-
- return ndim
-
-
-class _Location_Indexer_Base():
- """Base class for location indexer like loc and iloc
- """
-
- def __init__(self, ray_df):
- self.df = ray_df
- self.col_coord_df = ray_df._col_metadata._coord_df
- self.row_coord_df = ray_df._row_metadata._coord_df
- self.block_oids = ray_df._block_partitions
-
- self.is_view = False
- if isinstance(ray_df, DataFrameView):
- self.block_oids = ray_df._block_partitions_data
- self.is_view = True
-
- def __getitem__(self, row_lookup, col_lookup, ndim):
- """
- Args:
- row_lookup: A pandas dataframe, a partial view from row_coord_df
- col_lookup: A pandas dataframe, a partial view from col_coord_df
- ndim: the dimension of returned data
- """
- if ndim == 2:
- return self._generate_view(row_lookup, col_lookup)
-
- extracted = self._retrive_items(row_lookup, col_lookup)
- if ndim == 1:
- result = ray.get(_blocks_to_col.remote(*extracted)).squeeze()
-
- if is_scalar(result):
- result = pandas.Series(result)
-
- scaler_axis = row_lookup if len(row_lookup) == 1 else col_lookup
- series_name = scaler_axis.iloc[0].name
- result.name = series_name
-
- index_axis = row_lookup if len(col_lookup) == 1 else col_lookup
- result.index = index_axis.index
-
- if ndim == 0:
- result = ray.get(extracted[0]).squeeze()
-
- return result
-
- def _retrive_items(self, row_lookup, col_lookup):
- """Given lookup dataframes, return a list of result oids
- """
- result_oids = []
-
- # We have to copy before we groupby because
- # https://github.com/pandas-dev/pandas/issues/10043
- row_groups = row_lookup.copy().groupby('partition')
- col_groups = col_lookup.copy().groupby('partition')
- for row_blk, row_data in row_groups:
- for col_blk, col_data in col_groups:
- block_oid = self.block_oids[row_blk, col_blk]
- row_idx = row_data['index_within_partition']
- col_idx = col_data['index_within_partition']
-
- result_oid = extractor.remote(block_oid, row_idx, col_idx)
- result_oids.append(result_oid)
- return result_oids
-
- def _generate_view(self, row_lookup, col_lookup):
- """Generate a DataFrameView from lookup
- """
- row_lengths = [0] * len(self.df._row_metadata._lengths)
- for i in row_lookup["partition"]:
- row_lengths[i] += 1
- col_lengths = [0] * len(self.df._col_metadata._lengths)
- for i in col_lookup["partition"]:
- col_lengths[i] += 1
-
- row_metadata_view = _IndexMetadata(
- coord_df_oid=row_lookup, lengths_oid=row_lengths)
-
- col_metadata_view = _IndexMetadata(
- coord_df_oid=col_lookup, lengths_oid=col_lengths)
-
- df_view = DataFrameView(
- block_partitions=self.block_oids,
- row_metadata=row_metadata_view,
- col_metadata=col_metadata_view,
- index=row_metadata_view.index,
- columns=col_metadata_view.index)
-
- return df_view
-
- def __setitem__(self, row_lookup, col_lookup, item):
- """
- Args:
- row_lookup: A pandas dataframe, a partial view from row_coord_df
- col_lookup: A pandas dataframe, a partial view from col_coord_df
- item: The new item needs to be set. It can be any shape that's
- broadcastable to the product of the lookup tables.
- """
- to_shape = (len(row_lookup), len(col_lookup))
- item = self._broadcast_item(item, to_shape)
- self._write_items(row_lookup, col_lookup, item)
-
- def _broadcast_item(self, item, to_shape):
- """Use numpy to broadcast or reshape item.
-
- Notes:
- - Numpy is memory efficent, there shouldn't be performance issue.
- """
- try:
- item = np.array(item)
- if np.prod(to_shape) == np.prod(item.shape):
- return item.reshape(to_shape)
- else:
- return np.broadcast_to(item, to_shape)
- except ValueError:
- from_shape = np.array(item).shape
- raise ValueError(
- "could not broadcast input array from \
- shape {from_shape} into shape {to_shape}".format(
- from_shape=from_shape, to_shape=to_shape))
-
- def _write_items(self, row_lookup, col_lookup, item):
- """Perform remote write and replace blocks.
- """
-
- # We have to copy before we groupby because
- # https://github.com/pandas-dev/pandas/issues/10043
- row_groups = row_lookup.copy().groupby('partition')
- col_groups = col_lookup.copy().groupby('partition')
-
- row_item_index = 0
- for row_blk, row_data in row_groups:
- row_len = len(row_data)
-
- col_item_index = 0
- for col_blk, col_data in col_groups:
- col_len = len(col_data)
-
- block_oid = self.block_oids[row_blk, col_blk]
- row_idx = row_data['index_within_partition']
- col_idx = col_data['index_within_partition']
-
- item_to_write = item[row_item_index:row_item_index + row_len,
- col_item_index:col_item_index + col_len]
-
- result_oid = writer.remote(block_oid, row_idx, col_idx,
- item_to_write)
-
- if self.is_view:
- self.df._block_partitions_data[row_blk,
- col_blk] = result_oid
- else:
- self.df._block_partitions[row_blk, col_blk] = result_oid
-
- col_item_index += col_len
- row_item_index += row_len
-
-
-class _Loc_Indexer(_Location_Indexer_Base):
- """A indexer for ray_df.loc[] functionality"""
-
- def __getitem__(self, key):
- row_loc, col_loc, ndim = _parse_tuple(key)
- self._handle_enlargement(row_loc, col_loc)
- row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
- ndim = self._expand_dim(row_lookup, col_lookup, ndim)
- result = super(_Loc_Indexer, self).__getitem__(row_lookup, col_lookup,
- ndim)
- return result
-
- def __setitem__(self, key, item):
- row_loc, col_loc, _ = _parse_tuple(key)
- self._handle_enlargement(row_loc, col_loc)
- row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
- super(_Loc_Indexer, self).__setitem__(row_lookup, col_lookup,
- item)
-
- def _handle_enlargement(self, row_loc, col_loc):
- """Handle Enlargement (if there is one).
-
- Returns:
- None
- """
- locators = [row_loc, col_loc]
- coord_dfs = [self.row_coord_df, self.col_coord_df]
- axis = ['row', 'col']
- metadata = {'row': self.df._row_metadata, 'col': self.df._col_metadata}
-
- for loc, coord, axis in zip(locators, coord_dfs, axis):
- if _is_enlargement(loc, coord):
- new_meta = self._enlarge_axis(loc, axis=axis)
- _warn_enlargement()
- metadata[axis] = new_meta
-
- self.row_coord_df = metadata['row']._coord_df
- self.col_coord_df = metadata['col']._coord_df
-
- def _enlarge_axis(self, locator, axis):
- """Add rows/columns to block partitions according to locator.
-
- Returns:
- metadata (_IndexMetadata)
- """
- # 1. Prepare variables
- row_based_bool = axis == 'row'
- # major == the axis of the locator
- major_meta = self.df._row_metadata if row_based_bool \
- else self.df._col_metadata
- minor_meta = self.df._col_metadata if row_based_bool \
- else self.df._row_metadata
-
- # 2. Compute the nan labels and add blocks
- nan_labels = self._compute_enlarge_labels(locator, major_meta.index)
- num_nan_labels = len(nan_labels)
- blk_part_n_row, blk_part_n_col = self.block_oids.shape
-
- nan_blk_lens = minor_meta._lengths
- nan_blks = np.array([[
- _get_nan_block_id(
- num_nan_labels, n_cols, transpose=not row_based_bool)
- for n_cols in nan_blk_lens
- ]])
- nan_blks = nan_blks.T if not row_based_bool else nan_blks
-
- self.block_oids = np.concatenate(
- [self.block_oids, nan_blks], axis=0 if row_based_bool else 1)
-
- # 3. Prepare metadata to return
- nan_coord_df = pandas.DataFrame(data=[{
- '': name,
- 'partition': blk_part_n_row if row_based_bool else blk_part_n_col,
- 'index_within_partition': i
- } for name, i in zip(nan_labels, np.arange(num_nan_labels))
- ]).set_index('')
-
- coord_df = pandas.concat([major_meta._coord_df, nan_coord_df])
- coord_df = coord_df.loc[locator] # Re-index that allows duplicates
-
- lens = major_meta._lengths
- lens = np.concatenate([lens, np.array([num_nan_labels])])
-
- metadata_view = _IndexMetadata(coord_df_oid=coord_df, lengths_oid=lens)
- return metadata_view
-
- def _compute_enlarge_labels(self, locator, base_index):
- """Helper for _enlarge_axis, compute common labels and extra labels.
-
- Returns:
- nan_labels: The labels needs to be added
- """
- # base_index_type can be pd.Index or pd.DatetimeIndex
- # depending on user input and pandas behavior
- # See issue #2264
- base_index_type = type(base_index)
- locator_as_index = base_index_type(locator)
-
- nan_labels = locator_as_index.difference(base_index)
- common_labels = locator_as_index.intersection(base_index)
-
- if len(common_labels) == 0:
- raise KeyError(
- 'None of [{labels}] are in the [{base_index_name}]'.format(
- labels=list(locator_as_index), base_index_name=base_index))
-
- return nan_labels
-
- def _expand_dim(self, row_lookup, col_lookup, ndim):
- """Expand the dimension if necessary.
- This method is for cases like duplicate labels.
- """
- many_rows = len(row_lookup) > 1
- many_cols = len(col_lookup) > 1
-
- if ndim == 0 and (many_rows or many_cols):
- ndim = 1
- if ndim == 1 and (many_rows and many_cols):
- ndim = 2
-
- return ndim
-
- def _compute_lookup(self, row_loc, col_loc):
- # We use reindex for list to avoid duplicates.
- row_lookup = self.row_coord_df.loc[row_loc]
- col_lookup = self.col_coord_df.loc[col_loc]
- return row_lookup, col_lookup
-
-
-class _iLoc_Indexer(_Location_Indexer_Base):
- """A indexer for ray_df.iloc[] functionality"""
-
- def __getitem__(self, key):
- row_loc, col_loc, ndim = _parse_tuple(key)
-
- self._check_dtypes(row_loc)
- self._check_dtypes(col_loc)
-
- row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
- result = super(_iLoc_Indexer, self).__getitem__(
- row_lookup, col_lookup, ndim)
- return result
-
- def __setitem__(self, key, item):
- row_loc, col_loc, _ = _parse_tuple(key)
-
- self._check_dtypes(row_loc)
- self._check_dtypes(col_loc)
-
- row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
- super(_iLoc_Indexer, self).__setitem__(
- row_lookup, col_lookup, item)
-
- def _compute_lookup(self, row_loc, col_loc):
- # We use reindex for list to avoid duplicates.
- return self.row_coord_df.iloc[row_loc], self.col_coord_df.iloc[col_loc]
-
- def _check_dtypes(self, locator):
- is_int = is_integer(locator)
- is_int_slice = is_integer_slice(locator)
- is_int_list = is_list_like(locator) and all(map(is_integer, locator))
- is_bool_arr = is_boolean_array(locator)
-
- if not any([is_int, is_int_slice, is_int_list, is_bool_arr]):
- raise ValueError(_ILOC_INT_ONLY_ERROR)
-
-
-class DataFrameView(DataFrame):
- """A subclass of DataFrame where the index can be smaller than blocks.
- """
-
- def __init__(self, block_partitions, row_metadata, col_metadata, index,
- columns):
- self._block_partitions = block_partitions
- self._row_metadata = row_metadata
- self._col_metadata = col_metadata
- self.index = index
- self.columns = columns
-
- def _get_block_partitions(self):
- oid_arr = _mask_block_partitions(self._block_partitions_data,
- self._row_metadata,
- self._col_metadata)
- return oid_arr
-
- def _set_block_partitions(self, new_block_partitions):
- self._block_partitions_data = new_block_partitions
-
- _block_partitions = property(_get_block_partitions, _set_block_partitions)
diff --git a/python/ray/dataframe/io.py b/python/ray/dataframe/io.py
deleted file mode 100644
index 8b1bdbb4e..000000000
--- a/python/ray/dataframe/io.py
+++ /dev/null
@@ -1,524 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from itertools import chain
-from io import BytesIO
-import os
-import re
-import warnings
-import pandas
-
-from pyarrow.parquet import ParquetFile
-from pandas.io.common import _infer_compression # don't depend on internal API
-
-
-from .dataframe import ray, DataFrame
-from . import get_npartitions
-from .utils import from_pandas
-
-
-# Parquet
-def read_parquet(path, engine='auto', columns=None, **kwargs):
- """Load a parquet object from the file path, returning a DataFrame.
- Ray DataFrame only supports pyarrow engine for now.
-
- Args:
- path: The filepath of the parquet file.
- We only support local files for now.
- engine: Ray only support pyarrow reader.
- This argument doesn't do anything for now.
- kwargs: Pass into parquet's read_row_group function.
- """
- pf = ParquetFile(path)
-
- n_rows = pf.metadata.num_rows
- chunksize = n_rows // get_npartitions()
- n_row_groups = pf.metadata.num_row_groups
-
- idx_regex = re.compile('__index_level_\d+__')
- columns = [
- name for name in pf.metadata.schema.names if not idx_regex.match(name)
- ]
-
- df_from_row_groups = [
- _read_parquet_row_group.remote(path, columns, i, kwargs)
- for i in range(n_row_groups)
- ]
- splited_dfs = ray.get(
- [_split_df.remote(df, chunksize) for df in df_from_row_groups])
- df_remotes = list(chain.from_iterable(splited_dfs))
-
- return DataFrame(row_partitions=df_remotes, columns=columns)
-
-
-@ray.remote
-def _read_parquet_row_group(path, columns, row_group_id, kwargs={}):
- """Read a parquet row_group given file_path.
- """
- pf = ParquetFile(path)
- df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas()
- return df
-
-
-@ray.remote
-def _split_df(pandas_df, chunksize):
- """Split a pandas_df into partitions.
-
- Returns:
- remote_df_ids ([ObjectID])
- """
- dataframes = []
-
- while len(pandas_df) > chunksize:
- t_df = pandas_df[:chunksize]
- t_df.reset_index(drop=True)
- top = ray.put(t_df)
- dataframes.append(top)
- pandas_df = pandas_df[chunksize:]
- else:
- pandas_df = pandas_df.reset_index(drop=True)
- dataframes.append(ray.put(pandas_df))
-
- return dataframes
-
-
-# CSV
-def _compute_offset(fn, npartitions, ignore_first_line=False):
- """
- Calculate the currect bytes offsets for a csv file.
- Return a list of (start, end) tuple where the end == \n or EOF.
- """
- total_bytes = os.path.getsize(fn)
- bio = open(fn, 'rb')
- if ignore_first_line:
- start = len(bio.readline())
- chunksize = (total_bytes - start) // npartitions
- else:
- start = 0
- chunksize = total_bytes // npartitions
- if chunksize == 0:
- chunksize = 1
-
- offsets = []
- while start < total_bytes:
- bio.seek(chunksize, 1) # Move forward {chunksize} bytes
- extend_line = bio.readline() # Move after the next \n
- total_offset = chunksize + len(extend_line)
- # The position of the \n we just crossed.
- new_line_cursor = start + total_offset - 1
- offsets.append((start, new_line_cursor))
- start = new_line_cursor + 1
-
- bio.close()
- return offsets
-
-
-def _get_firstline(file_path):
- bio = open(file_path, 'rb')
- first = bio.readline()
- bio.close()
- return first
-
-
-def _infer_column(first_line, kwargs={}):
- return pandas.read_csv(BytesIO(first_line), **kwargs).columns
-
-
-@ray.remote
-def _read_csv_with_offset(fn, start, end, kwargs={}, header=b''):
- kwargs["quoting"] = int(kwargs["quoting"]) # See issue #2078
-
- bio = open(fn, 'rb')
- bio.seek(start)
- to_read = header + bio.read(end - start)
- bio.close()
- pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
- index = pandas_df.index
- # Partitions must have RangeIndex
- pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
- return pandas_df, index
-
-
-@ray.remote
-def get_index(*partition_indices):
- return partition_indices[0].append(partition_indices[1:])
-
-
-def read_csv(filepath_or_buffer,
- sep=',',
- delimiter=None,
- header='infer',
- names=None,
- index_col=None,
- usecols=None,
- squeeze=False,
- prefix=None,
- mangle_dupe_cols=True,
- dtype=None,
- engine=None,
- converters=None,
- true_values=None,
- false_values=None,
- skipinitialspace=False,
- skiprows=None,
- nrows=None,
- na_values=None,
- keep_default_na=True,
- na_filter=True,
- verbose=False,
- skip_blank_lines=True,
- parse_dates=False,
- infer_datetime_format=False,
- keep_date_col=False,
- date_parser=None,
- dayfirst=False,
- iterator=False,
- chunksize=None,
- compression='infer',
- thousands=None,
- decimal=b'.',
- lineterminator=None,
- quotechar='"',
- quoting=0,
- escapechar=None,
- comment=None,
- encoding=None,
- dialect=None,
- tupleize_cols=None,
- error_bad_lines=True,
- warn_bad_lines=True,
- skipfooter=0,
- skip_footer=0,
- doublequote=True,
- delim_whitespace=False,
- as_recarray=None,
- compact_ints=None,
- use_unsigned=None,
- low_memory=True,
- buffer_lines=None,
- memory_map=False,
- float_precision=None):
- """Read csv file from local disk.
-
- Args:
- filepath:
- The filepath of the csv file.
- We only support local files for now.
- kwargs: Keyword arguments in pandas::from_csv
- """
-
- kwargs = {
- 'sep': sep,
- 'delimiter': delimiter,
- 'header': header,
- 'names': names,
- 'index_col': index_col,
- 'usecols': usecols,
- 'squeeze': squeeze,
- 'prefix': prefix,
- 'mangle_dupe_cols': mangle_dupe_cols,
- 'dtype': dtype,
- 'engine': engine,
- 'converters': converters,
- 'true_values': true_values,
- 'false_values': false_values,
- 'skipinitialspace': skipinitialspace,
- 'skiprows': skiprows,
- 'nrows': nrows,
- 'na_values': na_values,
- 'keep_default_na': keep_default_na,
- 'na_filter': na_filter,
- 'verbose': verbose,
- 'skip_blank_lines': skip_blank_lines,
- 'parse_dates': parse_dates,
- 'infer_datetime_format': infer_datetime_format,
- 'keep_date_col': keep_date_col,
- 'date_parser': date_parser,
- 'dayfirst': dayfirst,
- 'iterator': iterator,
- 'chunksize': chunksize,
- 'compression': compression,
- 'thousands': thousands,
- 'decimal': decimal,
- 'lineterminator': lineterminator,
- 'quotechar': quotechar,
- 'quoting': quoting,
- 'escapechar': escapechar,
- 'comment': comment,
- 'encoding': encoding,
- 'dialect': dialect,
- 'tupleize_cols': tupleize_cols,
- 'error_bad_lines': error_bad_lines,
- 'warn_bad_lines': warn_bad_lines,
- 'skipfooter': skipfooter,
- 'skip_footer': skip_footer,
- 'doublequote': doublequote,
- 'delim_whitespace': delim_whitespace,
- 'as_recarray': as_recarray,
- 'compact_ints': compact_ints,
- 'use_unsigned': use_unsigned,
- 'low_memory': low_memory,
- 'buffer_lines': buffer_lines,
- 'memory_map': memory_map,
- 'float_precision': float_precision,
- }
-
- # Default to Pandas read_csv for non-serializable objects
- if not isinstance(filepath_or_buffer, str) or \
- _infer_compression(filepath_or_buffer, compression) is not None:
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- pandas_obj = pandas.read_csv(filepath_or_buffer, **kwargs)
- if isinstance(pandas_obj, pandas.DataFrame):
- return from_pandas(pandas_obj, get_npartitions())
-
- return pandas_obj
-
- filepath = filepath_or_buffer
-
- # TODO: handle case where header is a list of lines
- first_line = _get_firstline(filepath)
- columns = _infer_column(first_line, kwargs=kwargs)
- if header is None or (header == "infer" and names is not None):
- first_line = b""
- ignore_first_line = False
- else:
- ignore_first_line = True
-
- offsets = _compute_offset(filepath, get_npartitions(),
- ignore_first_line=ignore_first_line)
-
- # Serialize objects to speed up later use in remote tasks
- first_line_id = ray.put(first_line)
- kwargs_id = ray.put(kwargs)
-
- df_obj_ids = []
- index_obj_ids = []
- for start, end in offsets:
- if start != 0:
- df, index = _read_csv_with_offset._submit(
- args=(filepath, start, end, kwargs_id, first_line_id),
- num_return_vals=2)
- else:
- df, index = _read_csv_with_offset._submit(
- args=(filepath, start, end, kwargs_id),
- num_return_vals=2)
- df_obj_ids.append(df)
- index_obj_ids.append(index)
-
- index = get_index.remote(*index_obj_ids) if index_col is not None else None
-
- return DataFrame(row_partitions=df_obj_ids, columns=columns, index=index)
-
-
-def read_json(path_or_buf=None,
- orient=None,
- typ='frame',
- dtype=True,
- convert_axes=True,
- convert_dates=True,
- keep_default_dates=True,
- numpy=False,
- precise_float=False,
- date_unit=None,
- encoding=None,
- lines=False,
- chunksize=None,
- compression='infer'):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_json(path_or_buf, orient, typ, dtype,
- convert_axes, convert_dates,
- keep_default_dates, numpy, precise_float,
- date_unit, encoding, lines, chunksize,
- compression)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_html(io,
- match='.+',
- flavor=None,
- header=None,
- index_col=None,
- skiprows=None,
- attrs=None,
- parse_dates=False,
- tupleize_cols=None,
- thousands=',',
- encoding=None,
- decimal='.',
- converters=None,
- na_values=None,
- keep_default_na=True):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_html(io, match, flavor, header, index_col,
- skiprows, attrs, parse_dates, tupleize_cols,
- thousands, encoding, decimal, converters,
- na_values, keep_default_na)
- ray_frame = from_pandas(port_frame[0], get_npartitions())
-
- return ray_frame
-
-
-def read_clipboard(sep=r'\s+'):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_clipboard(sep)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_excel(io,
- sheet_name=0,
- header=0,
- skiprows=None,
- skip_footer=0,
- index_col=None,
- names=None,
- usecols=None,
- parse_dates=False,
- date_parser=None,
- na_values=None,
- thousands=None,
- convert_float=True,
- converters=None,
- dtype=None,
- true_values=None,
- false_values=None,
- engine=None,
- squeeze=False):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_excel(io, sheet_name, header, skiprows,
- skip_footer, index_col, names, usecols,
- parse_dates, date_parser, na_values,
- thousands, convert_float, converters, dtype,
- true_values, false_values, engine, squeeze)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_hdf(path_or_buf,
- key=None,
- mode='r'):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_hdf(path_or_buf, key, mode)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_feather(path,
- nthreads=1):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_feather(path)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_msgpack(path_or_buf,
- encoding='utf-8',
- iterator=False):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_msgpack(path_or_buf, encoding, iterator)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_stata(filepath_or_buffer,
- convert_dates=True,
- convert_categoricals=True,
- encoding=None,
- index_col=None,
- convert_missing=False,
- preserve_dtypes=True,
- columns=None,
- order_categoricals=True,
- chunksize=None,
- iterator=False):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_stata(filepath_or_buffer, convert_dates,
- convert_categoricals, encoding, index_col,
- convert_missing, preserve_dtypes, columns,
- order_categoricals, chunksize, iterator)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_sas(filepath_or_buffer,
- format=None,
- index=None,
- encoding=None,
- chunksize=None,
- iterator=False):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_sas(filepath_or_buffer, format, index, encoding,
- chunksize, iterator)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_pickle(path,
- compression='infer'):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_pickle(path, compression)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
-
-
-def read_sql(sql,
- con,
- index_col=None,
- coerce_float=True,
- params=None,
- parse_dates=None,
- columns=None,
- chunksize=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_sql(sql, con, index_col, coerce_float, params,
- parse_dates, columns, chunksize)
- ray_frame = from_pandas(port_frame, get_npartitions())
-
- return ray_frame
diff --git a/python/ray/dataframe/iterator.py b/python/ray/dataframe/iterator.py
deleted file mode 100644
index b26d1cf46..000000000
--- a/python/ray/dataframe/iterator.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from collections import Iterator
-
-
-class PartitionIterator(Iterator):
- def __init__(self, partitions, func):
- """PartitionIterator class to define a generator on partitioned data
-
- Args:
- partitions ([ObjectID]): Partitions to iterate over
- func (callable): The function to get inner iterables from
- each partition
- """
- self.partitions = iter(partitions)
- self.func = func
- self.iter_cache = iter([])
-
- def __iter__(self):
- return self
-
- def __next__(self):
- return self.next()
-
- def next(self):
- try:
- return next(self.iter_cache)
- except StopIteration:
- next_partition = next(self.partitions)
- self.iter_cache = self.func(next_partition)
- return self.next()
diff --git a/python/ray/dataframe/pandas_code_gen.py b/python/ray/dataframe/pandas_code_gen.py
deleted file mode 100644
index 8a122e530..000000000
--- a/python/ray/dataframe/pandas_code_gen.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import inspect
-
-
-def code_gen(pandas_obj, ray_obj, path):
- """Generate code skeleton for methods not in Ray
-
- Args:
- pandas_obj: The pandas object to generate code from.
- ray_obj: The ray object to diff against.
- path: Path to output the file to.
- """
-
- with open(path, "w") as outfile:
- funcs = pandas_ray_diff(pandas_obj, ray_obj)
-
- for func in funcs:
- if func[0] == "_" and func[1] != "_":
- continue
- if "attr" in func:
- # let's not mess with these
- continue
- try:
- outfile.write(
- "\ndef " + func +
- str(inspect.signature(getattr(pandas_obj, func))) + ":\n")
-
- except TypeError:
- outfile.write("\n@property")
- outfile.write("\ndef " + func + "(self):\n")
- except ValueError:
- continue
- outfile.write(
- " raise NotImplementedError(\"Not Yet implemented.\")\n")
-
-
-def code_gen_test(ray_obj, path, name):
- """Generate tests for methods in Ray."""
-
- with open(path, "a") as outfile:
- funcs = dir(ray_obj)
-
- for func in funcs:
- if func[0] == "_" and func[1] != "_":
- continue
-
- outfile.write("\n\ndef test_" + func + "():\n")
- outfile.write(
- " ray_" + name + " = create_test_" + name + "()\n\n" +
- " with pytest.raises(NotImplementedError):\n" +
- " ray_" + name + "." + func)
- try:
- first = True
- param_num = \
- len(inspect.signature(getattr(ray_obj, func)).parameters)
- if param_num > 1:
- param_num -= 1
-
- for _ in range(param_num):
- if first:
- outfile.write("(None")
- first = False
- else:
- outfile.write(", None")
- except (TypeError, ValueError, NotImplementedError):
- outfile.write("\n")
- continue
-
- if first:
- outfile.write("(")
- outfile.write(")\n")
-
-
-def pandas_ray_diff(pandas_obj, ray_obj):
- """Gets the diff of the methods in the Pandas and Ray objects.
-
- Args:
- pandas_obj: The Pandas object to diff.
- ray_obj: The Ray object to diff.
-
- Returns:
- A list of method names that are different between the two.
- """
- pandas_funcs = dir(pandas_obj)
- ray_funcs = dir(ray_obj)
-
- pandas_funcs = set(filter(lambda f: f[0] != "_" or f[1] == "_",
- pandas_funcs))
-
- diff = [x for x in pandas_funcs if x not in set(ray_funcs)]
- return diff
diff --git a/python/ray/dataframe/reshape.py b/python/ray/dataframe/reshape.py
deleted file mode 100644
index 1883f11b7..000000000
--- a/python/ray/dataframe/reshape.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ray
-import pandas
-import numpy as np
-
-from pandas import compat
-from pandas.core.dtypes.common import is_list_like
-from itertools import cycle
-
-from .dataframe import DataFrame
-from .utils import _deploy_func
-
-
-def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
- columns=None, sparse=False, drop_first=False):
- """Convert categorical variable into indicator variables.
-
- Args:
- data (array-like, Series, or DataFrame): data to encode.
- prefix (string, [string]): Prefix to apply to each encoded column
- label.
- prefix_sep (string, [string]): Separator between prefix and value.
- dummy_na (bool): Add a column to indicate NaNs.
- columns: Which columns to encode.
- sparse (bool): Not Implemented: If True, returns SparseDataFrame.
- drop_first (bool): Whether to remove the first level of encoded data.
-
- Returns:
- DataFrame or one-hot encoded data.
- """
- if not isinstance(data, DataFrame):
- return pandas.get_dummies(data, prefix=prefix, prefix_sep=prefix_sep,
- dummy_na=dummy_na, columns=columns,
- sparse=sparse, drop_first=drop_first)
-
- if sparse:
- raise NotImplementedError(
- "SparseDataFrame is not implemented. "
- "To contribute to Pandas on Ray, please visit "
- "github.com/ray-project/ray.")
-
- if columns is None:
- columns_to_encode = data.dtypes.isin([np.dtype("O"), 'category'])
- columns_to_encode = data.columns[columns_to_encode]
- else:
- columns_to_encode = columns
-
- def check_len(item, name):
- len_msg = ("Length of '{name}' ({len_item}) did not match the "
- "length of the columns being encoded ({len_enc}).")
-
- if is_list_like(item):
- if not len(item) == len(columns_to_encode):
- len_msg = len_msg.format(name=name, len_item=len(item),
- len_enc=len(columns_to_encode))
- raise ValueError(len_msg)
-
- check_len(prefix, 'prefix')
- check_len(prefix_sep, 'prefix_sep')
- if isinstance(prefix, compat.string_types):
- prefix = cycle([prefix])
- prefix = [next(prefix) for i in range(len(columns_to_encode))]
- if isinstance(prefix, dict):
- prefix = [prefix[col] for col in columns_to_encode]
-
- if prefix is None:
- prefix = columns_to_encode
-
- # validate separators
- if isinstance(prefix_sep, compat.string_types):
- prefix_sep = cycle([prefix_sep])
- prefix_sep = [next(prefix_sep) for i in range(len(columns_to_encode))]
- elif isinstance(prefix_sep, dict):
- prefix_sep = [prefix_sep[col] for col in columns_to_encode]
-
- if set(columns_to_encode) == set(data.columns):
- with_dummies = []
- dropped_columns = pandas.Index()
- else:
- with_dummies = data.drop(columns_to_encode, axis=1)._col_partitions
- dropped_columns = data.columns.drop(columns_to_encode)
-
- def get_dummies_remote(df, to_drop, prefix, prefix_sep):
- df = df.drop(to_drop, axis=1)
-
- if df.size == 0:
- return df, df.columns
-
- df = pandas.get_dummies(df, prefix=prefix, prefix_sep=prefix_sep,
- dummy_na=dummy_na, columns=None, sparse=sparse,
- drop_first=drop_first)
- columns = df.columns
- df.columns = pandas.RangeIndex(0, len(df.columns))
- return df, columns
-
- total = 0
- columns = []
- for i, part in enumerate(data._col_partitions):
- col_index = data._col_metadata.partition_series(i)
-
- # TODO(kunalgosar): Handle the case of duplicate columns here
- to_encode = col_index.index.isin(columns_to_encode)
-
- to_encode = col_index[to_encode]
- to_drop = col_index.drop(to_encode.index)
-
- result = _deploy_func._submit(
- args=(get_dummies_remote, part, to_drop,
- prefix[total:total + len(to_encode)],
- prefix_sep[total:total + len(to_encode)]),
- num_return_vals=2)
-
- with_dummies.append(result[0])
- columns.append(result[1])
- total += len(to_encode)
-
- columns = ray.get(columns)
- dropped_columns = dropped_columns.append(columns)
-
- return DataFrame(col_partitions=with_dummies,
- columns=dropped_columns,
- index=data.index)
diff --git a/python/ray/dataframe/series.py b/python/ray/dataframe/series.py
deleted file mode 100644
index aa102363d..000000000
--- a/python/ray/dataframe/series.py
+++ /dev/null
@@ -1,970 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import pandas
-
-from .utils import _inherit_docstrings
-
-
-def na_op():
- """Pandas uses a similar function to handle na values.
- """
- raise NotImplementedError("Not Yet implemented.")
-
-
-@_inherit_docstrings(pandas.Series, excluded=[pandas.Series,
- pandas.Series.__init__])
-class Series(object):
-
- def __init__(self, series_oids):
- """Constructor for a Series object.
-
- Args:
- series_oids ([ObjectID]): The list of remote Series objects.
- """
- self.series_oids = series_oids
-
- @property
- def T(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __abs__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __add__(self, right, name='__add__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __and__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def __array__(self, result=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __array_prepare__(self, result, context=None):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def __array_priority__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __array_wrap__(self, result, context=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __bool__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __bytes__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __class__(self, data=None, index=None, dtype=None, name=None,
- copy=False, fastpath=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def __contains__(self, key):
- raise NotImplementedError("Not Yet implemented.")
-
- def __copy__(self, deep=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def __deepcopy__(self, memo=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __delitem__(self, key):
- raise NotImplementedError("Not Yet implemented.")
-
- def __dir__(self):
- return list(type(self).__dict__.keys())
-
- def __div__(self, right, name='__truediv__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __divmod__(self, right, name='__divmod__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def __doc__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __eq__(self, other, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __finalize__(self, other, method=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def __float__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __floordiv__(self, right, name='__floordiv__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __ge__(self, other, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __getitem__(self, key):
- raise NotImplementedError("Not Yet implemented.")
-
- def __getstate__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __gt__(self, other, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __iadd__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def __imul__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def __int__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __invert__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __ipow__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def __isub__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def __iter__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __itruediv__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def __le__(self, other, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __len__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __long__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __lt__(self, other, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __mod__(self, right, name='__mod__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __mul__(self, right, name='__mul__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __ne__(self, other, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def __neg__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __nonzero__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __or__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def __pow__(self, right, name='__pow__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __repr__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __round__(self, decimals=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def __setitem__(self, key, value):
- raise NotImplementedError("Not Yet implemented.")
-
- def __setstate__(self, state):
- raise NotImplementedError("Not Yet implemented.")
-
- def __sizeof__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __str__(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def __sub__(self, right, name='__sub__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __truediv__(self, right, name='__truediv__', na_op=na_op):
- raise NotImplementedError("Not Yet implemented.")
-
- def __xor__(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def abs(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def add(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def add_prefix(self, prefix):
- raise NotImplementedError("Not Yet implemented.")
-
- def add_suffix(self, suffix):
- raise NotImplementedError("Not Yet implemented.")
-
- def agg(self, func, axis=0, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def aggregate(self, func, axis=0, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def align(self, other, join='outer', axis=None, level=None, copy=True,
- fill_value=None, method=None, limit=None, fill_axis=0,
- broadcast_axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def all(self, axis=None, bool_only=None, skipna=None, level=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def any(self, axis=None, bool_only=None, skipna=None, level=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def append(self, to_append, ignore_index=False, verify_integrity=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def apply(self, func, convert_dtype=True, args=(), **kwds):
- raise NotImplementedError("Not Yet implemented.")
-
- def argmax(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def argmin(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def argsort(self, axis=0, kind='quicksort', order=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def as_blocks(self, copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def as_matrix(self, columns=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def asfreq(self, freq, method=None, how=None, normalize=False,
- fill_value=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def asof(self, where, subset=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def astype(self, dtype, copy=True, errors='raise', **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def at(self, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def at_time(self, time, asof=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def autocorr(self, lag=1):
- raise NotImplementedError("Not Yet implemented.")
-
- def between(self, left, right, inclusive=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def between_time(self, start_time, end_time, include_start=True,
- include_end=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def bool(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def clip_lower(self, threshold, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def clip_upper(self, threshold, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def combine(self, other, func, fill_value=np.nan):
- raise NotImplementedError("Not Yet implemented.")
-
- def combine_first(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def compound(self, axis=None, skipna=None, level=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def compress(self, condition, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def consolidate(self, inplace=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def convert_objects(self, convert_dates=True, convert_numeric=False,
- convert_timedeltas=True, copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def copy(self, deep=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def corr(self, other, method='pearson', min_periods=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def count(self, level=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def cov(self, other, min_periods=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def cummax(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def cummin(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def cumprod(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def cumsum(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def describe(self, percentiles=None, include=None, exclude=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def diff(self, periods=1):
- raise NotImplementedError("Not Yet implemented.")
-
- def div(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def divide(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def dot(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
- raise NotImplementedError("Not Yet implemented.")
-
- def drop_duplicates(self, keep='first', inplace=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def dropna(self, axis=0, inplace=False, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def duplicated(self, keep='first'):
- raise NotImplementedError("Not Yet implemented.")
-
- def eq(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def equals(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def ewm(self, com=None, span=None, halflife=None, alpha=None,
- min_periods=0, freq=None, adjust=True, ignore_na=False, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def expanding(self, min_periods=1, freq=None, center=False, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def factorize(self, sort=False, na_sentinel=-1):
- raise NotImplementedError("Not Yet implemented.")
-
- def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def fillna(self, value=None, method=None, axis=None, inplace=False,
- limit=None, downcast=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def filter(self, items=None, like=None, regex=None, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def first(self, offset):
- raise NotImplementedError("Not Yet implemented.")
-
- def first_valid_index(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def floordiv(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def from_array(self, arr, index=None, name=None, dtype=None, copy=False,
- fastpath=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def from_csv(self, path, sep=',', parse_dates=True, header=None,
- index_col=0, encoding=None, infer_datetime_format=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def ge(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def get(self, key, default=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def get_dtype_counts(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def get_ftype_counts(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def get_value(self, label, takeable=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def get_values(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
- group_keys=True, squeeze=False, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def gt(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def head(self, n=5):
- raise NotImplementedError("Not Yet implemented.")
-
- def hist(self, by=None, ax=None, grid=True, xlabelsize=None, xrot=None,
- ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds):
- raise NotImplementedError("Not Yet implemented.")
-
- def iat(self, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def idxmax(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def idxmin(self, axis=None, skipna=True, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def iloc(self, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
- limit_direction='forward', downcast=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def isin(self, values):
- raise NotImplementedError("Not Yet implemented.")
-
- def isnull(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def item(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def items(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def iteritems(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def ix(self, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def keys(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def kurt(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def last(self, offset):
- raise NotImplementedError("Not Yet implemented.")
-
- def last_valid_index(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def le(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def loc(self, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def lt(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def mad(self, axis=None, skipna=None, level=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def map(self, arg, na_action=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
- try_cast=False, raise_on_error=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def max(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def mean(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def median(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def memory_usage(self, index=True, deep=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def min(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def mod(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def mode(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def mul(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def multiply(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def ne(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def nlargest(self, n=5, keep='first'):
- raise NotImplementedError("Not Yet implemented.")
-
- def nonzero(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def notnull(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def nsmallest(self, n=5, keep='first'):
- raise NotImplementedError("Not Yet implemented.")
-
- def nunique(self, dropna=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def pipe(self, func, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def plot(self, kind='line', ax=None, figsize=None, use_index=True,
- title=None, grid=None, legend=False, style=None, logx=False,
- logy=False, loglog=False, xticks=None, yticks=None, xlim=None,
- ylim=None, rot=None, fontsize=None, colormap=None, table=False,
- yerr=None, xerr=None, label=None, secondary_y=False, **kwds):
- raise NotImplementedError("Not Yet implemented.")
-
- def pop(self, item):
- raise NotImplementedError("Not Yet implemented.")
-
- def pow(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def prod(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def product(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def ptp(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def put(self, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def quantile(self, q=0.5, interpolation='linear'):
- raise NotImplementedError("Not Yet implemented.")
-
- def radd(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def rank(self, axis=0, method='average', numeric_only=None,
- na_option='keep', ascending=True, pct=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def ravel(self, order='C'):
- raise NotImplementedError("Not Yet implemented.")
-
- def rdiv(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def reindex(self, index=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def reindex_axis(self, labels, axis=0, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def reindex_like(self, other, method=None, copy=True, limit=None,
- tolerance=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def rename(self, index=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def rename_axis(self, mapper, axis=0, copy=True, inplace=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def reorder_levels(self, order):
- raise NotImplementedError("Not Yet implemented.")
-
- def repeat(self, repeats, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def replace(self, to_replace=None, value=None, inplace=False, limit=None,
- regex=False, method='pad', axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
- label=None, convention='start', kind=None, loffset=None,
- limit=None, base=0, on=None, level=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def reset_index(self, level=None, drop=False, name=None, inplace=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def reshape(self, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def rfloordiv(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def rmod(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def rmul(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def rolling(self, window, min_periods=None, freq=None, center=False,
- win_type=None, on=None, axis=0, closed=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def round(self, decimals=0, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def rpow(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def rsub(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def rtruediv(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def sample(self, n=None, frac=None, replace=False, weights=None,
- random_state=None, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def searchsorted(self, value, side='left', sorter=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def select(self, crit, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def sem(self, axis=None, skipna=None, level=None, ddof=1,
- numeric_only=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def set_axis(self, axis, labels):
- raise NotImplementedError("Not Yet implemented.")
-
- def set_value(self, label, value, takeable=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def shift(self, periods=1, freq=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def skew(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def slice_shift(self, periods=1, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
- kind='quicksort', na_position='last', sort_remaining=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def sort_values(self, axis=0, ascending=True, inplace=False,
- kind='quicksort', na_position='last'):
- raise NotImplementedError("Not Yet implemented.")
-
- def sortlevel(self, level=0, ascending=True, sort_remaining=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def squeeze(self, axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def std(self, axis=None, skipna=None, level=None, ddof=1,
- numeric_only=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def sub(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def subtract(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def sum(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def swapaxes(self, axis1, axis2, copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def swaplevel(self, i=-2, j=-1, copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def tail(self, n=5):
- raise NotImplementedError("Not Yet implemented.")
-
- def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_clipboard(self, excel=None, sep=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_csv(self, path=None, index=True, sep=',', na_rep='',
- float_format=None, header=False, index_label=None, mode='w',
- encoding=None, date_format=None, decimal='.'):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_dense(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_dict(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
- float_format=None, columns=None, header=True, index=True,
- index_label=None, startrow=0, startcol=0, engine=None,
- merge_cells=True, encoding=None, inf_rep='inf',
- verbose=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_frame(self, name=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_hdf(self, path_or_buf, key, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_json(self, path_or_buf=None, orient=None, date_format=None,
- double_precision=10, force_ascii=True, date_unit='ms',
- default_handler=None, lines=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_latex(self, buf=None, columns=None, col_space=None, header=True,
- index=True, na_rep='NaN', formatters=None, float_format=None,
- sparsify=None, index_names=True, bold_rows=False,
- column_format=None, longtable=None, escape=None,
- encoding=None, decimal='.', multicolumn=None,
- multicolumn_format=None, multirow=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_period(self, freq=None, copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_pickle(self, path, compression='infer'):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_sparse(self, kind='block', fill_value=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
- index=True, index_label=None, chunksize=None, dtype=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_string(self, buf=None, na_rep='NaN', float_format=None,
- header=True, index=True, length=False, dtype=False,
- name=False, max_rows=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_timestamp(self, freq=None, how='start', copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_xarray(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def tolist(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def transform(self, func, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def transpose(self, *args, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def truediv(self, other, level=None, fill_value=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def truncate(self, before=None, after=None, axis=None, copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def tshift(self, periods=1, freq=None, axis=0):
- raise NotImplementedError("Not Yet implemented.")
-
- def tz_convert(self, tz, axis=0, level=None, copy=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def tz_localize(self, tz, axis=0, level=None, copy=True,
- ambiguous='raise'):
- raise NotImplementedError("Not Yet implemented.")
-
- def unique(self):
- raise NotImplementedError("Not Yet implemented.")
-
- def unstack(self, level=-1, fill_value=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def upandasate(self, other):
- raise NotImplementedError("Not Yet implemented.")
-
- def valid(self, inplace=False, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def value_counts(self, normalize=False, sort=True, ascending=False,
- bins=None, dropna=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def var(self, axis=None, skipna=None, level=None, ddof=1,
- numeric_only=None, **kwargs):
- raise NotImplementedError("Not Yet implemented.")
-
- def view(self, dtype=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
- try_cast=False, raise_on_error=True):
- raise NotImplementedError("Not Yet implemented.")
-
- def xs(key, axis=0, level=None, drop_level=True):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def asobject(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def axes(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def base(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def blocks(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def data(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def dtype(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def dtypes(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def empty(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def flags(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def ftype(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def ftypes(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def hasnans(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def imag(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def index(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def is_copy(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def is_monotonic(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def is_monotonic_decreasing(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def is_monotonic_increasing(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def is_unique(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def itemsize(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def name(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def nbytes(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def ndim(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def real(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def shape(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def size(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def strides(self):
- raise NotImplementedError("Not Yet implemented.")
-
- @property
- def values(self):
- raise NotImplementedError("Not Yet implemented.")
diff --git a/python/ray/dataframe/test/test_concat.py b/python/ray/dataframe/test/test_concat.py
deleted file mode 100644
index 62e881d05..000000000
--- a/python/ray/dataframe/test/test_concat.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pytest
-import pandas
-import ray.dataframe as pd
-from ray.dataframe.utils import (
- to_pandas,
- from_pandas
-)
-
-
-@pytest.fixture
-def ray_df_equals_pandas(ray_df, pandas_df):
- return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index())
-
-
-@pytest.fixture
-def generate_dfs():
- df = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]})
-
- df2 = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col6': [12, 13, 14, 15],
- 'col7': [0, 0, 0, 0]})
- return df, df2
-
-
-@pytest.fixture
-def generate_none_dfs():
- df = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, None, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [None, None, None, None]})
-
- df2 = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col6': [12, 13, 14, 15],
- 'col7': [0, 0, 0, 0]})
- return df, df2
-
-
-@pytest.fixture
-def test_df_concat():
- df, df2 = generate_dfs()
-
- assert(ray_df_equals_pandas(pd.concat([df, df2]),
- pandas.concat([df, df2])))
-
-
-def test_ray_concat():
- df, df2 = generate_dfs()
- ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
-
- assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2]),
- pandas.concat([df, df2]))
-
-
-def test_ray_concat_on_index():
- df, df2 = generate_dfs()
- ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
-
- assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis='index'),
- pandas.concat([df, df2], axis='index'))
-
- assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis='rows'),
- pandas.concat([df, df2], axis='rows'))
-
- assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis=0),
- pandas.concat([df, df2], axis=0))
-
-
-def test_ray_concat_on_column():
- df, df2 = generate_dfs()
- ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
-
- assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis=1),
- pandas.concat([df, df2], axis=1))
-
- assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2], axis="columns"),
- pandas.concat([df, df2], axis="columns"))
-
-
-def test_invalid_axis_errors():
- df, df2 = generate_dfs()
- ray_df, ray_df2 = from_pandas(df, 2), from_pandas(df2, 2)
-
- with pytest.raises(ValueError):
- pd.concat([ray_df, ray_df2], axis=2)
-
-
-def test_mixed_concat():
- df, df2 = generate_dfs()
- df3 = df.copy()
-
- mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
-
- assert(ray_df_equals_pandas(pd.concat(mixed_dfs),
- pandas.concat([df, df2, df3])))
-
-
-def test_mixed_inner_concat():
- df, df2 = generate_dfs()
- df3 = df.copy()
-
- mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
-
- assert(ray_df_equals_pandas(pd.concat(mixed_dfs, join='inner'),
- pandas.concat([df, df2, df3], join='inner')))
-
-
-def test_mixed_none_concat():
- df, df2 = generate_none_dfs()
- df3 = df.copy()
-
- mixed_dfs = [from_pandas(df, 2), from_pandas(df2, 2), df3]
-
- assert(ray_df_equals_pandas(pd.concat(mixed_dfs),
- pandas.concat([df, df2, df3])))
diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py
deleted file mode 100644
index 39e2b0585..000000000
--- a/python/ray/dataframe/test/test_dataframe.py
+++ /dev/null
@@ -1,3353 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pytest
-import numpy as np
-import pandas
-import pandas.util.testing as tm
-from pandas.tests.frame.common import TestData
-import ray.dataframe as pd
-from ray.dataframe.utils import to_pandas
-
-
-@pytest.fixture
-def ray_df_equals_pandas(ray_df, pandas_df):
- return to_pandas(ray_df).equals(pandas_df)
-
-
-@pytest.fixture
-def ray_series_equals_pandas(ray_series, pandas_series):
- return ray_series.equals(pandas_series)
-
-
-@pytest.fixture
-def ray_df_equals(ray_df1, ray_df2):
- return ray_df1.equals(ray_df2)
-
-
-@pytest.fixture
-def create_test_dataframe():
- return pd.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]})
-
-
-def test_int_dataframe():
- frame_data = {'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]}
- pandas_df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- testfuncs = [lambda x: x + 1,
- lambda x: str(x),
- lambda x: x * x,
- lambda x: x,
- lambda x: False]
-
- query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2',
- '(col2 > col1) and (col1 < col3)']
-
- keys = ['col1',
- 'col2',
- 'col3',
- 'col4']
-
- filter_by = {'items': ['col1', 'col5'],
- 'regex': '4$|3$',
- 'like': 'col'}
-
- test_filter(ray_df, pandas_df, filter_by)
- test_index(ray_df, pandas_df)
- test_size(ray_df, pandas_df)
- test_ndim(ray_df, pandas_df)
- test_ftypes(ray_df, pandas_df)
- test_dtypes(ray_df, pandas_df)
- test_values(ray_df, pandas_df)
- test_axes(ray_df, pandas_df)
- test_shape(ray_df, pandas_df)
- test_add_prefix(ray_df, pandas_df)
- test_add_suffix(ray_df, pandas_df)
-
- for testfunc in testfuncs:
- test_applymap(ray_df, pandas_df, testfunc)
-
- test_copy(ray_df)
- test_sum(ray_df, pandas_df)
- test_prod(ray_df, pandas_df)
- test_product(ray_df, pandas_df)
- test_abs(ray_df, pandas_df)
- test_keys(ray_df, pandas_df)
- test_transpose(ray_df, pandas_df)
- test_round(ray_df, pandas_df)
- test_query(ray_df, pandas_df, query_funcs)
-
- test_mean(ray_df, pandas_df)
- test_var(ray_df, pandas_df)
- test_std(ray_df, pandas_df)
- test_median(ray_df, pandas_df)
-
- quantiles = [0.25, 0.5, 0.75, 0.66, 0.01]
-
- for q in quantiles:
- test_quantile(ray_df, pandas_df, q)
-
- test_describe(ray_df, pandas_df)
- test_diff(ray_df, pandas_df)
- test_rank(ray_df, pandas_df)
-
- test_all(ray_df, pandas_df)
- test_any(ray_df, pandas_df)
- test___getitem__(ray_df, pandas_df)
- test___neg__(ray_df, pandas_df)
- test___iter__(ray_df, pandas_df)
- test___abs__(ray_df, pandas_df)
- test___delitem__(ray_df, pandas_df)
- test___copy__(ray_df, pandas_df)
- test___deepcopy__(ray_df, pandas_df)
- test_bool(ray_df, pandas_df)
- test_count(ray_df, pandas_df)
- test_head(ray_df, pandas_df, 2)
- test_head(ray_df, pandas_df)
- test_tail(ray_df, pandas_df)
- test_idxmax(ray_df, pandas_df)
- test_idxmin(ray_df, pandas_df)
- test_pop(ray_df, pandas_df)
-
- test___len__(ray_df, pandas_df)
- test_first_valid_index(ray_df, pandas_df)
- test_last_valid_index(ray_df, pandas_df)
-
- for key in keys:
- test_get(ray_df, pandas_df, key)
-
- test_get_dtype_counts(ray_df, pandas_df)
- test_get_ftype_counts(ray_df, pandas_df)
- test_iterrows(ray_df, pandas_df)
- test_items(ray_df, pandas_df)
- test_iteritems(ray_df, pandas_df)
- test_itertuples(ray_df, pandas_df)
-
- test_max(ray_df, pandas_df)
- test_min(ray_df, pandas_df)
- test_notna(ray_df, pandas_df)
- test_notnull(ray_df, pandas_df)
- test_cummax(ray_df, pandas_df)
- test_cummin(ray_df, pandas_df)
- test_cumprod(ray_df, pandas_df)
- test_cumsum(ray_df, pandas_df)
- test_pipe(ray_df, pandas_df)
-
- # test_loc(ray_df, pandas_df)
- # test_iloc(ray_df, pandas_df)
-
- labels = ['a', 'b', 'c', 'd']
- test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
- labels.append('e')
- test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
-
- for key in keys:
- test_set_index(ray_df, pandas_df, key)
-
- test_reset_index(ray_df, pandas_df)
- test_reset_index(ray_df, pandas_df, inplace=True)
-
- for key in keys:
- test___contains__(ray_df, key, True)
- test___contains__(ray_df, "Not Exists", False)
-
- for key in keys:
- test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
- test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
-
- test___array__(ray_df, pandas_df)
-
- apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
- ['sum', 'sum']]
- for func in apply_agg_functions:
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
- if not isinstance(func, list):
- test_agg(ray_df, pandas_df, func, 1)
- test_apply(ray_df, pandas_df, func, 1)
- test_aggregate(ray_df, pandas_df, func, 1)
- else:
- with pytest.raises(TypeError):
- test_agg(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_apply(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_aggregate(ray_df, pandas_df, func, 1)
-
- func = ['sum', lambda df: df.sum()]
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
- with pytest.raises(TypeError):
- test_apply(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_aggregate(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_agg(ray_df, pandas_df, func, 1)
-
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
- test_apply(ray_df, pandas_df, lambda df: -df, 0)
- test_transform(ray_df, pandas_df)
-
-
-def test_float_dataframe():
- frame_data = {'col1': [0.0, 1.0, 2.0, 3.0],
- 'col2': [4.0, 5.0, 6.0, 7.0],
- 'col3': [8.0, 9.0, 10.0, 11.0],
- 'col4': [12.0, 13.0, 14.0, 15.0],
- 'col5': [0.0, 0.0, 0.0, 0.0]}
-
- pandas_df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- testfuncs = [lambda x: x + 1,
- lambda x: str(x),
- lambda x: x * x,
- lambda x: x,
- lambda x: False]
-
- query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2',
- '(col2 > col1) and (col1 < col3)']
-
- keys = ['col1',
- 'col2',
- 'col3',
- 'col4']
-
- filter_by = {'items': ['col1', 'col5'],
- 'regex': '4$|3$',
- 'like': 'col'}
-
- test_filter(ray_df, pandas_df, filter_by)
- test_index(ray_df, pandas_df)
- test_size(ray_df, pandas_df)
- test_ndim(ray_df, pandas_df)
- test_ftypes(ray_df, pandas_df)
- test_dtypes(ray_df, pandas_df)
- test_values(ray_df, pandas_df)
- test_axes(ray_df, pandas_df)
- test_shape(ray_df, pandas_df)
- test_add_prefix(ray_df, pandas_df)
- test_add_suffix(ray_df, pandas_df)
-
- for testfunc in testfuncs:
- test_applymap(ray_df, pandas_df, testfunc)
-
- test_copy(ray_df)
- test_sum(ray_df, pandas_df)
- test_prod(ray_df, pandas_df)
- test_product(ray_df, pandas_df)
- test_abs(ray_df, pandas_df)
- test_keys(ray_df, pandas_df)
- test_transpose(ray_df, pandas_df)
- test_round(ray_df, pandas_df)
- test_query(ray_df, pandas_df, query_funcs)
-
- test_mean(ray_df, pandas_df)
- # TODO Clear floating point error.
- # test_var(ray_df, pandas_df)
- test_std(ray_df, pandas_df)
- test_median(ray_df, pandas_df)
-
- quantiles = [0.25, 0.5, 0.75, 0.66, 0.01]
-
- for q in quantiles:
- test_quantile(ray_df, pandas_df, q)
-
- test_describe(ray_df, pandas_df)
- test_diff(ray_df, pandas_df)
- test_rank(ray_df, pandas_df)
-
- test_all(ray_df, pandas_df)
- test_any(ray_df, pandas_df)
- test___getitem__(ray_df, pandas_df)
- test___neg__(ray_df, pandas_df)
- test___iter__(ray_df, pandas_df)
- test___abs__(ray_df, pandas_df)
- test___delitem__(ray_df, pandas_df)
- test___copy__(ray_df, pandas_df)
- test___deepcopy__(ray_df, pandas_df)
- test_bool(ray_df, pandas_df)
- test_count(ray_df, pandas_df)
- test_head(ray_df, pandas_df, 3)
- test_head(ray_df, pandas_df)
- test_tail(ray_df, pandas_df)
- test_idxmax(ray_df, pandas_df)
- test_idxmin(ray_df, pandas_df)
- test_pop(ray_df, pandas_df)
- test_max(ray_df, pandas_df)
- test_min(ray_df, pandas_df)
- test_notna(ray_df, pandas_df)
- test_notnull(ray_df, pandas_df)
- test_cummax(ray_df, pandas_df)
- test_cummin(ray_df, pandas_df)
- test_cumprod(ray_df, pandas_df)
- test_cumsum(ray_df, pandas_df)
- test_pipe(ray_df, pandas_df)
-
- test___len__(ray_df, pandas_df)
- test_first_valid_index(ray_df, pandas_df)
- test_last_valid_index(ray_df, pandas_df)
-
- for key in keys:
- test_get(ray_df, pandas_df, key)
-
- test_get_dtype_counts(ray_df, pandas_df)
- test_get_ftype_counts(ray_df, pandas_df)
- test_iterrows(ray_df, pandas_df)
- test_items(ray_df, pandas_df)
- test_iteritems(ray_df, pandas_df)
- test_itertuples(ray_df, pandas_df)
-
- # test_loc(ray_df, pandas_df)
- # test_iloc(ray_df, pandas_df)
-
- labels = ['a', 'b', 'c', 'd']
- test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
- labels.append('e')
- test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
-
- for key in keys:
- test_set_index(ray_df, pandas_df, key)
- test_set_index(ray_df, pandas_df, key, inplace=True)
-
- test_reset_index(ray_df, pandas_df)
- test_reset_index(ray_df, pandas_df, inplace=True)
-
- for key in keys:
- test___contains__(ray_df, key, True)
- test___contains__(ray_df, "Not Exists", False)
-
- for key in keys:
- test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
- test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
-
- # TODO Nans are always not equal to each other, fix it
- # test___array__(ray_df, pandas_df)
-
- apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
- ['sum', 'sum']]
- for func in apply_agg_functions:
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
- if not isinstance(func, list):
- test_agg(ray_df, pandas_df, func, 1)
- test_apply(ray_df, pandas_df, func, 1)
- test_aggregate(ray_df, pandas_df, func, 1)
- else:
- with pytest.raises(TypeError):
- test_agg(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_apply(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_aggregate(ray_df, pandas_df, func, 1)
-
- func = ['sum', lambda df: df.sum()]
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
- with pytest.raises(TypeError):
- test_apply(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_aggregate(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_agg(ray_df, pandas_df, func, 1)
-
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
- test_apply(ray_df, pandas_df, lambda df: -df, 0)
- test_transform(ray_df, pandas_df)
-
-
-def test_mixed_dtype_dataframe():
- frame_data = {'col1': [1, 2, 3, 4],
- 'col2': [4, 5, 6, 7],
- 'col3': [8.0, 9.4, 10.1, 11.3],
- 'col4': ['a', 'b', 'c', 'd']}
-
- pandas_df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- testfuncs = [lambda x: x + x,
- lambda x: str(x),
- lambda x: x,
- lambda x: False]
-
- query_funcs = ['col1 < col2', 'col1 == col2',
- '(col2 > col1) and (col1 < col3)']
-
- keys = ['col1',
- 'col2',
- 'col3',
- 'col4']
-
- filter_by = {'items': ['col1', 'col5'],
- 'regex': '4$|3$',
- 'like': 'col'}
-
- test_filter(ray_df, pandas_df, filter_by)
- test_index(ray_df, pandas_df)
- test_size(ray_df, pandas_df)
- test_ndim(ray_df, pandas_df)
- test_ftypes(ray_df, pandas_df)
- test_dtypes(ray_df, pandas_df)
- test_values(ray_df, pandas_df)
- test_axes(ray_df, pandas_df)
- test_shape(ray_df, pandas_df)
- test_add_prefix(ray_df, pandas_df)
- test_add_suffix(ray_df, pandas_df)
-
- for testfunc in testfuncs:
- test_applymap(ray_df, pandas_df, testfunc)
-
- test_copy(ray_df)
- test_sum(ray_df, pandas_df)
-
- with pytest.raises(TypeError):
- test_abs(ray_df, pandas_df)
- test___abs__(ray_df, pandas_df)
-
- test_keys(ray_df, pandas_df)
- test_transpose(ray_df, pandas_df)
- test_round(ray_df, pandas_df)
- test_query(ray_df, pandas_df, query_funcs)
-
- test_mean(ray_df, pandas_df)
- # TODO Clear floating point error.
- # test_var(ray_df, pandas_df)
- test_std(ray_df, pandas_df)
- test_median(ray_df, pandas_df)
-
- quantiles = [0.25, 0.5, 0.75, 0.66, 0.01]
-
- for q in quantiles:
- test_quantile(ray_df, pandas_df, q)
-
- test_describe(ray_df, pandas_df)
-
- # TODO Reolve once Pandas-20962 is resolved.
- # test_rank(ray_df, pandas_df)
-
- test_all(ray_df, pandas_df)
- test_any(ray_df, pandas_df)
- test___getitem__(ray_df, pandas_df)
-
- with pytest.raises(TypeError):
- test___neg__(ray_df, pandas_df)
-
- test___iter__(ray_df, pandas_df)
- test___delitem__(ray_df, pandas_df)
- test___copy__(ray_df, pandas_df)
- test___deepcopy__(ray_df, pandas_df)
- test_bool(ray_df, pandas_df)
- test_count(ray_df, pandas_df)
- test_head(ray_df, pandas_df, 2)
- test_head(ray_df, pandas_df)
- test_tail(ray_df, pandas_df)
-
- with pytest.raises(TypeError):
- test_idxmax(ray_df, pandas_df)
- with pytest.raises(TypeError):
- test_idxmin(ray_df, pandas_df)
-
- test_pop(ray_df, pandas_df)
- test_max(ray_df, pandas_df)
- test_min(ray_df, pandas_df)
- test_notna(ray_df, pandas_df)
- test_notnull(ray_df, pandas_df)
- test_pipe(ray_df, pandas_df)
-
- # TODO Fix pandas so that the behavior is correct
- # We discovered a bug where argmax does not always give the same result
- # depending on what your other dtypes are.
- # test_cummax(ray_df, pandas_df)
- # test_cummin(ray_df, pandas_df)
- # test_cumprod(ray_df, pandas_df)
- # test_cumsum(ray_df, pandas_df)
-
- test___len__(ray_df, pandas_df)
- test_first_valid_index(ray_df, pandas_df)
- test_last_valid_index(ray_df, pandas_df)
-
- for key in keys:
- test_get(ray_df, pandas_df, key)
-
- test_get_dtype_counts(ray_df, pandas_df)
- test_get_ftype_counts(ray_df, pandas_df)
- test_iterrows(ray_df, pandas_df)
- test_items(ray_df, pandas_df)
- test_iteritems(ray_df, pandas_df)
- test_itertuples(ray_df, pandas_df)
-
- # test_loc(ray_df, pandas_df)
- # test_iloc(ray_df, pandas_df)
-
- labels = ['a', 'b', 'c', 'd']
- test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
- test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
-
- for key in keys:
- test_set_index(ray_df, pandas_df, key)
- test_set_index(ray_df, pandas_df, key, inplace=True)
-
- test_reset_index(ray_df, pandas_df)
- test_reset_index(ray_df, pandas_df, inplace=True)
-
- for key in keys:
- test___contains__(ray_df, key, True)
- test___contains__(ray_df, "Not Exists", False)
-
- for key in keys:
- test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
- test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
-
- test___array__(ray_df, pandas_df)
-
- apply_agg_functions = ['sum', lambda df: df.sum()]
- for func in apply_agg_functions:
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
-
- func = ['sum', lambda df: df.sum()]
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
- with pytest.raises(TypeError):
- test_apply(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_aggregate(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_agg(ray_df, pandas_df, func, 1)
-
- test_transform(ray_df, pandas_df)
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
-
-
-def test_nan_dataframe():
- frame_data = {'col1': [1, 2, 3, np.nan],
- 'col2': [4, 5, np.nan, 7],
- 'col3': [8, np.nan, 10, 11],
- 'col4': [np.nan, 13, 14, 15]}
-
- pandas_df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- testfuncs = [lambda x: x + x,
- lambda x: str(x),
- lambda x: x,
- lambda x: False]
-
- query_funcs = ['col1 < col2', 'col3 > col4', 'col1 == col2',
- '(col2 > col1) and (col1 < col3)']
-
- keys = ['col1',
- 'col2',
- 'col3',
- 'col4']
-
- filter_by = {'items': ['col1', 'col5'],
- 'regex': '4$|3$',
- 'like': 'col'}
-
- test_filter(ray_df, pandas_df, filter_by)
- test_index(ray_df, pandas_df)
- test_size(ray_df, pandas_df)
- test_ndim(ray_df, pandas_df)
- test_ftypes(ray_df, pandas_df)
- test_dtypes(ray_df, pandas_df)
- test_values(ray_df, pandas_df)
- test_axes(ray_df, pandas_df)
- test_shape(ray_df, pandas_df)
- test_add_prefix(ray_df, pandas_df)
- test_add_suffix(ray_df, pandas_df)
-
- for testfunc in testfuncs:
- test_applymap(ray_df, pandas_df, testfunc)
-
- test_copy(ray_df)
- test_sum(ray_df, pandas_df)
- test_abs(ray_df, pandas_df)
- test_keys(ray_df, pandas_df)
- test_transpose(ray_df, pandas_df)
- test_round(ray_df, pandas_df)
- test_query(ray_df, pandas_df, query_funcs)
-
- test_mean(ray_df, pandas_df)
- test_var(ray_df, pandas_df)
- test_std(ray_df, pandas_df)
- test_median(ray_df, pandas_df)
-
- quantiles = [0.25, 0.5, 0.75, 0.66, 0.01]
-
- for q in quantiles:
- test_quantile(ray_df, pandas_df, q)
-
- test_describe(ray_df, pandas_df)
- test_diff(ray_df, pandas_df)
- test_rank(ray_df, pandas_df)
-
- test_all(ray_df, pandas_df)
- test_any(ray_df, pandas_df)
- test___getitem__(ray_df, pandas_df)
- test___neg__(ray_df, pandas_df)
- test___iter__(ray_df, pandas_df)
- test___abs__(ray_df, pandas_df)
- test___delitem__(ray_df, pandas_df)
- test___copy__(ray_df, pandas_df)
- test___deepcopy__(ray_df, pandas_df)
- test_bool(ray_df, pandas_df)
- test_count(ray_df, pandas_df)
- test_head(ray_df, pandas_df, 2)
- test_head(ray_df, pandas_df)
- test_tail(ray_df, pandas_df)
- test_idxmax(ray_df, pandas_df)
- test_idxmin(ray_df, pandas_df)
- test_pop(ray_df, pandas_df)
- test_max(ray_df, pandas_df)
- test_min(ray_df, pandas_df)
- test_notna(ray_df, pandas_df)
- test_notnull(ray_df, pandas_df)
- test_cummax(ray_df, pandas_df)
- test_cummin(ray_df, pandas_df)
- test_cumprod(ray_df, pandas_df)
- test_cumsum(ray_df, pandas_df)
- test_pipe(ray_df, pandas_df)
-
- test___len__(ray_df, pandas_df)
- test_first_valid_index(ray_df, pandas_df)
- test_last_valid_index(ray_df, pandas_df)
-
- for key in keys:
- test_get(ray_df, pandas_df, key)
-
- test_get_dtype_counts(ray_df, pandas_df)
- test_get_ftype_counts(ray_df, pandas_df)
- test_iterrows(ray_df, pandas_df)
- test_items(ray_df, pandas_df)
- test_iteritems(ray_df, pandas_df)
- test_itertuples(ray_df, pandas_df)
-
- # test_loc(ray_df, pandas_df)
- # test_iloc(ray_df, pandas_df)
-
- labels = ['a', 'b', 'c', 'd']
- test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
- test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
-
- for key in keys:
- test_set_index(ray_df, pandas_df, key)
- test_set_index(ray_df, pandas_df, key, inplace=True)
-
- test_reset_index(ray_df, pandas_df)
- test_reset_index(ray_df, pandas_df, inplace=True)
-
- for key in keys:
- test___contains__(ray_df, key, True)
- test___contains__(ray_df, "Not Exists", False)
-
- for key in keys:
- test_insert(ray_df, pandas_df, 0, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 0, "New Column", pandas_df[key])
- test_insert(ray_df, pandas_df, 1, "New Column", ray_df[key])
- test_insert(ray_df, pandas_df, 4, "New Column", ray_df[key])
-
- # TODO Nans are always not equal to each other, fix it
- # test___array__(ray_df, pandas_df)
-
- apply_agg_functions = ['sum', lambda df: df.sum(), ['sum', 'mean'],
- ['sum', 'sum']]
- for func in apply_agg_functions:
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
- if not isinstance(func, list):
- test_agg(ray_df, pandas_df, func, 1)
- test_apply(ray_df, pandas_df, func, 1)
- test_aggregate(ray_df, pandas_df, func, 1)
- else:
- with pytest.raises(TypeError):
- test_agg(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_apply(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_aggregate(ray_df, pandas_df, func, 1)
-
- func = ['sum', lambda df: df.sum()]
- test_apply(ray_df, pandas_df, func, 0)
- test_aggregate(ray_df, pandas_df, func, 0)
- test_agg(ray_df, pandas_df, func, 0)
- with pytest.raises(TypeError):
- test_apply(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_aggregate(ray_df, pandas_df, func, 1)
- with pytest.raises(TypeError):
- test_agg(ray_df, pandas_df, func, 1)
-
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
- test_apply(ray_df, pandas_df, lambda df: -df, 0)
- test_transform(ray_df, pandas_df)
-
-
-def test_empty_df():
- df = pd.DataFrame(index=['a', 'b'])
- test_is_empty(df)
- tm.assert_index_equal(df.index, pd.Index(['a', 'b']))
- assert len(df.columns) == 0
-
- df = pd.DataFrame(columns=['a', 'b'])
- test_is_empty(df)
- assert len(df.index) == 0
- tm.assert_index_equal(df.columns, pd.Index(['a', 'b']))
-
- df = pd.DataFrame()
- test_is_empty(df)
- assert len(df.index) == 0
- assert len(df.columns) == 0
-
- df = pd.DataFrame(index=['a', 'b'])
- test_is_empty(df)
- tm.assert_index_equal(df.index, pd.Index(['a', 'b']))
- assert len(df.columns) == 0
-
- df = pd.DataFrame(columns=['a', 'b'])
- test_is_empty(df)
- assert len(df.index) == 0
- tm.assert_index_equal(df.columns, pd.Index(['a', 'b']))
-
- df = pd.DataFrame()
- test_is_empty(df)
- assert len(df.index) == 0
- assert len(df.columns) == 0
-
-
-@pytest.fixture
-def test_is_empty(df):
- assert df.size == 0 and df.empty
- assert df.shape[0] == 0 or df.shape[1] == 0
-
-
-def test_dense_nan_df():
- frame_data = [[np.nan, 2, np.nan, 0],
- [3, 4, np.nan, 1],
- [np.nan, np.nan, np.nan, 5]]
- ray_df = pd.DataFrame(frame_data, columns=list('ABCD'))
-
- pd_df = pandas.DataFrame(frame_data, columns=list('ABCD'))
-
- column_subsets = [list('AD'), list('BC'), list('CD')]
- row_subsets = [[0, 1], [0, 1, 2], [2, 0]]
-
- test_dropna(ray_df, pd_df)
- test_dropna_inplace(ray_df, pd_df)
- test_dropna_multiple_axes(ray_df, pd_df)
- test_dropna_multiple_axes_inplace(ray_df, pd_df)
- test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets)
- test_dropna_subset_error(ray_df)
-
-
-@pytest.fixture
-def test_inter_df_math(op, simple=False):
- frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
- "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
-
- ray_df = pd.DataFrame(frame_data)
- pandas_df = pandas.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df),
- getattr(pandas_df, op)(pandas_df))
- assert ray_df_equals_pandas(getattr(ray_df, op)(4),
- getattr(pandas_df, op)(4))
- assert ray_df_equals_pandas(getattr(ray_df, op)(4.0),
- getattr(pandas_df, op)(4.0))
-
- frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}
- ray_df2 = pd.DataFrame(frame_data)
- pandas_df2 = pandas.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df2),
- getattr(pandas_df, op)(pandas_df2))
-
- list_test = [0, 1, 2, 4]
-
- if not simple:
- assert ray_df_equals_pandas(getattr(ray_df, op)(list_test, axis=1),
- getattr(pandas_df, op)(list_test, axis=1))
-
- assert ray_df_equals_pandas(getattr(ray_df, op)(list_test, axis=0),
- getattr(pandas_df, op)(list_test, axis=0))
-
-
-@pytest.fixture
-def test_comparison_inter_ops(op):
- frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
- "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
-
- ray_df = pd.DataFrame(frame_data)
- pandas_df = pandas.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(getattr(ray_df, op)(ray_df),
- getattr(pandas_df, op)(pandas_df))
- assert ray_df_equals_pandas(getattr(ray_df, op)(4),
- getattr(pandas_df, op)(4))
- assert ray_df_equals_pandas(getattr(ray_df, op)(4.0),
- getattr(pandas_df, op)(4.0))
-
- frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}
-
- ray_df2 = pd.DataFrame(frame_data)
- pandas_df2 = pandas.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(getattr(ray_df2, op)(ray_df2),
- getattr(pandas_df2, op)(pandas_df2))
-
-
-@pytest.fixture
-def test_inter_df_math_right_ops(op):
- frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
- "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
-
- ray_df = pd.DataFrame(frame_data)
- pandas_df = pandas.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(getattr(ray_df, op)(4),
- getattr(pandas_df, op)(4))
- assert ray_df_equals_pandas(getattr(ray_df, op)(4.0),
- getattr(pandas_df, op)(4.0))
-
-
-@pytest.fixture
-def test_index(ray_df, pandas_df):
- assert ray_df.index.equals(pandas_df.index)
- ray_df_cp = ray_df.copy()
- pandas_df_cp = pandas_df.copy()
-
- ray_df_cp.index = [str(i) for i in ray_df_cp.index]
- pandas_df_cp.index = [str(i) for i in pandas_df_cp.index]
- assert ray_df_cp.index.equals(pandas_df_cp.index)
-
-
-@pytest.fixture
-def test_size(ray_df, pandas_df):
- assert ray_df.size == pandas_df.size
-
-
-@pytest.fixture
-def test_ndim(ray_df, pandas_df):
- assert ray_df.ndim == pandas_df.ndim
-
-
-@pytest.fixture
-def test_ftypes(ray_df, pandas_df):
- assert ray_df.ftypes.equals(pandas_df.ftypes)
-
-
-@pytest.fixture
-def test_dtypes(ray_df, pandas_df):
- assert ray_df.dtypes.equals(pandas_df.dtypes)
-
-
-@pytest.fixture
-def test_values(ray_df, pandas_df):
- np.testing.assert_equal(ray_df.values, pandas_df.values)
-
-
-@pytest.fixture
-def test_axes(ray_df, pandas_df):
- for ray_axis, pd_axis in zip(ray_df.axes, pandas_df.axes):
- assert (np.array_equal(ray_axis, pd_axis))
-
-
-@pytest.fixture
-def test_shape(ray_df, pandas_df):
- assert ray_df.shape == pandas_df.shape
-
-
-@pytest.fixture
-def test_add_prefix(ray_df, pandas_df):
- test_prefix = "TEST"
- new_ray_df = ray_df.add_prefix(test_prefix)
- new_pandas_df = pandas_df.add_prefix(test_prefix)
- assert new_ray_df.columns.equals(new_pandas_df.columns)
-
-
-@pytest.fixture
-def test_add_suffix(ray_df, pandas_df):
- test_suffix = "TEST"
- new_ray_df = ray_df.add_suffix(test_suffix)
- new_pandas_df = pandas_df.add_suffix(test_suffix)
-
- assert new_ray_df.columns.equals(new_pandas_df.columns)
-
-
-@pytest.fixture
-def test_applymap(ray_df, pandas_df, testfunc):
- new_ray_df = ray_df.applymap(testfunc)
- new_pandas_df = pandas_df.applymap(testfunc)
-
- assert ray_df_equals_pandas(new_ray_df, new_pandas_df)
-
-
-@pytest.fixture
-def test_copy(ray_df):
- new_ray_df = ray_df.copy()
-
- assert new_ray_df is not ray_df
- assert np.array_equal(new_ray_df._block_partitions,
- ray_df._block_partitions)
-
-
-@pytest.fixture
-def test_sum(ray_df, pandas_df):
- assert ray_df.sum().equals(pandas_df.sum())
-
-
-@pytest.fixture
-def test_abs(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.abs(), pandas_df.abs())
-
-
-@pytest.fixture
-def test_keys(ray_df, pandas_df):
- assert ray_df.keys().equals(pandas_df.keys())
-
-
-@pytest.fixture
-def test_transpose(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.T, pandas_df.T)
- assert ray_df_equals_pandas(ray_df.transpose(), pandas_df.transpose())
-
-
-@pytest.fixture
-def test_get(ray_df, pandas_df, key):
- assert ray_df.get(key).equals(pandas_df.get(key))
- assert ray_df.get(
- key, default='default').equals(pandas_df.get(key, default='default'))
-
-
-@pytest.fixture
-def test_get_dtype_counts(ray_df, pandas_df):
- assert ray_df.get_dtype_counts().equals(pandas_df.get_dtype_counts())
-
-
-@pytest.fixture
-def test_get_ftype_counts(ray_df, pandas_df):
- assert ray_df.get_ftype_counts().equals(pandas_df.get_ftype_counts())
-
-
-def test_add():
- test_inter_df_math("add", simple=False)
-
-
-@pytest.fixture
-def test_agg(ray_df, pandas_df, func, axis):
- ray_result = ray_df.agg(func, axis)
- pandas_result = pandas_df.agg(func, axis)
- if isinstance(ray_result, pd.DataFrame):
- assert ray_df_equals_pandas(ray_result, pandas_result)
- else:
- assert ray_result.equals(pandas_result)
-
-
-@pytest.fixture
-def test_aggregate(ray_df, pandas_df, func, axis):
- ray_result = ray_df.aggregate(func, axis)
- pandas_result = pandas_df.aggregate(func, axis)
- if isinstance(ray_result, pd.DataFrame):
- assert ray_df_equals_pandas(ray_result, pandas_result)
- else:
- assert ray_result.equals(pandas_result)
-
-
-def test_align():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.align(None)
-
-
-@pytest.fixture
-def test_all(ray_df, pd_df):
- assert pd_df.all().equals(ray_df.all())
- assert pd_df.all(axis=1).equals(ray_df.all(axis=1))
-
-
-@pytest.fixture
-def test_any(ray_df, pd_df):
- assert pd_df.any().equals(ray_df.any())
- assert pd_df.any(axis=1).equals(ray_df.any(axis=1))
-
-
-def test_append():
- frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
- "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
-
- ray_df = pd.DataFrame(frame_data)
- pandas_df = pandas.DataFrame(frame_data)
-
- frame_data2 = {"col5": [0], "col6": [1]}
-
- ray_df2 = pd.DataFrame(frame_data2)
- pandas_df2 = pandas.DataFrame(frame_data2)
-
- assert ray_df_equals_pandas(ray_df.append(ray_df2),
- pandas_df.append(pandas_df2))
-
- with pytest.raises(ValueError):
- ray_df.append(ray_df2, verify_integrity=True)
-
-
-@pytest.fixture
-def test_apply(ray_df, pandas_df, func, axis):
- ray_result = ray_df.apply(func, axis)
- pandas_result = pandas_df.apply(func, axis)
- if isinstance(ray_result, pd.DataFrame):
- assert ray_df_equals_pandas(ray_result, pandas_result)
- else:
- assert ray_result.equals(pandas_result)
-
-
-def test_as_blocks():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.as_blocks()
-
-
-def test_as_matrix():
- test_data = TestData()
- frame = pd.DataFrame(test_data.frame)
- mat = frame.as_matrix()
-
- frame_columns = frame.columns
- for i, row in enumerate(mat):
- for j, value in enumerate(row):
- col = frame_columns[j]
- if np.isnan(value):
- assert np.isnan(frame[col][i])
- else:
- assert value == frame[col][i]
-
- # mixed type
- mat = pd.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A'])
- assert mat[0, 0] == 'bar'
-
- df = pd.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
- mat = df.as_matrix()
- assert mat[0, 0] == 1j
-
- # single block corner case
- mat = pd.DataFrame(test_data.frame).as_matrix(['A', 'B'])
- expected = test_data.frame.reindex(columns=['A', 'B']).values
- tm.assert_almost_equal(mat, expected)
-
-
-def test_asfreq():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.asfreq(None)
-
-
-def test_asof():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.asof(None)
-
-
-def test_assign():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.assign()
-
-
-def test_astype():
- td = TestData()
- ray_df = pd.DataFrame(td.frame)
- our_df_casted = ray_df.astype(np.int32)
- expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.int32),
- index=td.frame.index,
- columns=td.frame.columns)
-
- assert ray_df_equals_pandas(our_df_casted, expected_df_casted)
-
- our_df_casted = ray_df.astype(np.float64)
- expected_df_casted = pandas.DataFrame(td.frame.values.astype(np.float64),
- index=td.frame.index,
- columns=td.frame.columns)
-
- assert ray_df_equals_pandas(our_df_casted, expected_df_casted)
-
- our_df_casted = ray_df.astype(str)
- expected_df_casted = pandas.DataFrame(td.frame.values.astype(str),
- index=td.frame.index,
- columns=td.frame.columns)
-
- assert ray_df_equals_pandas(our_df_casted, expected_df_casted)
-
-
-def test_at_time():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.at_time(None)
-
-
-def test_between_time():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.between_time(None, None)
-
-
-@pytest.fixture
-def test_bfill():
- test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
- ray_df = pd.DataFrame(test_data.tsframe)
- assert ray_df_equals_pandas(ray_df.bfill(), test_data.tsframe.bfill())
-
-
-@pytest.fixture
-def test_bool(ray_df, pd_df):
- with pytest.raises(ValueError):
- ray_df.bool()
- pd_df.bool()
-
- single_bool_pd_df = pandas.DataFrame([True])
- single_bool_ray_df = pd.DataFrame([True])
-
- assert single_bool_pd_df.bool() == single_bool_ray_df.bool()
-
-
-def test_boxplot():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.boxplot()
-
-
-def test_clip():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.clip()
-
-
-def test_clip_lower():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.clip_lower(None)
-
-
-def test_clip_upper():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.clip_upper(None)
-
-
-def test_combine():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.combine(None, None)
-
-
-def test_combine_first():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.combine_first(None)
-
-
-def test_compound():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.compound()
-
-
-def test_consolidate():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.consolidate()
-
-
-def test_convert_objects():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.convert_objects()
-
-
-def test_corr():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.corr()
-
-
-def test_corrwith():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.corrwith(None)
-
-
-@pytest.fixture
-def test_count(ray_df, pd_df):
- assert ray_df.count().equals(pd_df.count())
- assert ray_df.count(axis=1).equals(pd_df.count(axis=1))
-
-
-def test_cov():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.cov()
-
-
-@pytest.fixture
-def test_cummax(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.cummax(), pandas_df.cummax())
-
-
-@pytest.fixture
-def test_cummin(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.cummin(), pandas_df.cummin())
-
-
-@pytest.fixture
-def test_cumprod(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.cumprod(), pandas_df.cumprod())
-
-
-@pytest.fixture
-def test_cumsum(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.cumsum(), pandas_df.cumsum())
-
-
-@pytest.fixture
-def test_describe(ray_df, pandas_df):
- assert ray_df.describe().equals(pandas_df.describe())
-
-
-@pytest.fixture
-def test_diff(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())
- assert ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))
- assert ray_df_equals_pandas(ray_df.diff(periods=1),
- pandas_df.diff(periods=1))
-
-
-def test_div():
- test_inter_df_math("div", simple=False)
-
-
-def test_divide():
- test_inter_df_math("divide", simple=False)
-
-
-def test_dot():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.dot(None)
-
-
-def test_drop():
- frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}
- simple = pandas.DataFrame(frame_data)
- ray_simple = pd.DataFrame(frame_data)
- assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[['B']])
- assert ray_df_equals_pandas(ray_simple.drop(["A", "B"], axis='columns'),
- simple[[]])
- assert ray_df_equals_pandas(ray_simple.drop([0, 1, 3], axis=0),
- simple.loc[[2], :])
- assert ray_df_equals_pandas(ray_simple.drop([0, 3], axis='index'),
- simple.loc[[1, 2], :])
-
- pytest.raises(ValueError, ray_simple.drop, 5)
- pytest.raises(ValueError, ray_simple.drop, 'C', 1)
- pytest.raises(ValueError, ray_simple.drop, [1, 5])
- pytest.raises(ValueError, ray_simple.drop, ['A', 'C'], 1)
-
- # errors = 'ignore'
- assert ray_df_equals_pandas(ray_simple.drop(5, errors='ignore'), simple)
- assert ray_df_equals_pandas(ray_simple.drop([0, 5], errors='ignore'),
- simple.loc[[1, 2, 3], :])
- assert ray_df_equals_pandas(ray_simple.drop('C', axis=1, errors='ignore'),
- simple)
- assert ray_df_equals_pandas(ray_simple.drop(['A', 'C'], axis=1,
- errors='ignore'),
- simple[['B']])
-
- # non-unique
- nu_df = pandas.DataFrame(pandas.compat.lzip(range(3), range(-3, 1),
- list('abc')),
- columns=['a', 'a', 'b'])
- ray_nu_df = pd.DataFrame(nu_df)
- assert ray_df_equals_pandas(ray_nu_df.drop('a', axis=1), nu_df[['b']])
- assert ray_df_equals_pandas(ray_nu_df.drop('b', axis='columns'),
- nu_df['a'])
- assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df)
-
- nu_df = nu_df.set_index(pandas.Index(['X', 'Y', 'X']))
- nu_df.columns = list('abc')
- ray_nu_df = pd.DataFrame(nu_df)
- assert ray_df_equals_pandas(ray_nu_df.drop('X', axis='rows'),
- nu_df.loc[["Y"], :])
- assert ray_df_equals_pandas(ray_nu_df.drop(['X', 'Y'], axis=0),
- nu_df.loc[[], :])
-
- # inplace cache issue
- frame_data = np.random.randn(10, 3)
- df = pandas.DataFrame(frame_data, columns=list('abc'))
- ray_df = pd.DataFrame(frame_data, columns=list('abc'))
- expected = df[~(df.b > 0)]
- ray_df.drop(labels=df[df.b > 0].index, inplace=True)
- assert ray_df_equals_pandas(ray_df, expected)
-
-
-def test_drop_api_equivalence():
- # equivalence of the labels/axis and index/columns API's
- frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]]
-
- ray_df = pd.DataFrame(frame_data, index=['a', 'b', 'c'],
- columns=['d', 'e', 'f'])
-
- ray_df1 = ray_df.drop('a')
- ray_df2 = ray_df.drop(index='a')
- assert ray_df_equals(ray_df1, ray_df2)
-
- ray_df1 = ray_df.drop('d', 1)
- ray_df2 = ray_df.drop(columns='d')
- assert ray_df_equals(ray_df1, ray_df2)
-
- ray_df1 = ray_df.drop(labels='e', axis=1)
- ray_df2 = ray_df.drop(columns='e')
- assert ray_df_equals(ray_df1, ray_df2)
-
- ray_df1 = ray_df.drop(['a'], axis=0)
- ray_df2 = ray_df.drop(index=['a'])
- assert ray_df_equals(ray_df1, ray_df2)
-
- ray_df1 = ray_df.drop(['a'], axis=0).drop(['d'], axis=1)
- ray_df2 = ray_df.drop(index=['a'], columns=['d'])
- assert ray_df_equals(ray_df1, ray_df2)
-
- with pytest.raises(ValueError):
- ray_df.drop(labels='a', index='b')
-
- with pytest.raises(ValueError):
- ray_df.drop(labels='a', columns='b')
-
- with pytest.raises(ValueError):
- ray_df.drop(axis=1)
-
-
-def test_drop_duplicates():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.drop_duplicates()
-
-
-@pytest.fixture
-def test_dropna(ray_df, pd_df):
- assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='all'),
- pd_df.dropna(axis=1, how='all'))
-
- assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='any'),
- pd_df.dropna(axis=1, how='any'))
-
- assert ray_df_equals_pandas(ray_df.dropna(axis=0, how='all'),
- pd_df.dropna(axis=0, how='all'))
-
- assert ray_df_equals_pandas(ray_df.dropna(thresh=2),
- pd_df.dropna(thresh=2))
-
-
-@pytest.fixture
-def test_dropna_inplace(ray_df, pd_df):
- ray_df = ray_df.copy()
- pd_df = pd_df.copy()
-
- ray_df.dropna(thresh=2, inplace=True)
- pd_df.dropna(thresh=2, inplace=True)
-
- assert ray_df_equals_pandas(ray_df, pd_df)
-
- ray_df.dropna(axis=1, how='any', inplace=True)
- pd_df.dropna(axis=1, how='any', inplace=True)
-
- assert ray_df_equals_pandas(ray_df, pd_df)
-
-
-@pytest.fixture
-def test_dropna_multiple_axes(ray_df, pd_df):
- assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=[0, 1]),
- pd_df.dropna(how='all', axis=[0, 1]))
- assert ray_df_equals_pandas(ray_df.dropna(how='all', axis=(0, 1)),
- pd_df.dropna(how='all', axis=(0, 1)))
-
-
-@pytest.fixture
-def test_dropna_multiple_axes_inplace(ray_df, pd_df):
- ray_df_copy = ray_df.copy()
- pd_df_copy = pd_df.copy()
-
- ray_df_copy.dropna(how='all', axis=[0, 1], inplace=True)
- pd_df_copy.dropna(how='all', axis=[0, 1], inplace=True)
-
- assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)
-
- ray_df_copy = ray_df.copy()
- pd_df_copy = pd_df.copy()
-
- ray_df_copy.dropna(how='all', axis=(0, 1), inplace=True)
- pd_df_copy.dropna(how='all', axis=(0, 1), inplace=True)
-
- assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)
-
-
-@pytest.fixture
-def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets):
- for subset in column_subsets:
- assert ray_df_equals_pandas(ray_df.dropna(how='all', subset=subset),
- pd_df.dropna(how='all', subset=subset))
-
- assert ray_df_equals_pandas(ray_df.dropna(how='any', subset=subset),
- pd_df.dropna(how='any', subset=subset))
-
- for subset in row_subsets:
- assert ray_df_equals_pandas(
- ray_df.dropna(how='all', axis=1, subset=subset),
- pd_df.dropna(how='all', axis=1, subset=subset))
-
- assert ray_df_equals_pandas(
- ray_df.dropna(how='any', axis=1, subset=subset),
- pd_df.dropna(how='any', axis=1, subset=subset))
-
-
-@pytest.fixture
-def test_dropna_subset_error(ray_df):
- with pytest.raises(KeyError):
- ray_df.dropna(subset=list('EF'))
-
- with pytest.raises(KeyError):
- ray_df.dropna(axis=1, subset=[4, 5])
-
-
-def test_duplicated():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.duplicated()
-
-
-def test_eq():
- test_comparison_inter_ops("eq")
-
-
-def test_equals():
- frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 4, 1]}
- ray_df1 = pd.DataFrame(frame_data)
- ray_df2 = pd.DataFrame(frame_data)
-
- assert ray_df1.equals(ray_df2)
-
- frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 5, 1]}
- ray_df3 = pd.DataFrame(frame_data)
-
- assert not ray_df3.equals(ray_df1)
- assert not ray_df3.equals(ray_df2)
-
-
-def test_eval_df_use_case():
- frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)}
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
- df.eval("e = arctan2(sin(a), b)",
- engine='python', parser='pandas', inplace=True)
- ray_df.eval("e = arctan2(sin(a), b)",
- engine='python',
- parser='pandas', inplace=True)
- # TODO: Use a series equality validator.
- assert ray_df_equals_pandas(ray_df, df)
-
-
-def test_eval_df_arithmetic_subexpression():
- frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)}
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
- df.eval("not_e = sin(a + b)",
- engine='python', parser='pandas', inplace=True)
- ray_df.eval("not_e = sin(a + b)",
- engine='python', parser='pandas', inplace=True)
- # TODO: Use a series equality validator.
- assert ray_df_equals_pandas(ray_df, df)
-
-
-def test_ewm():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.ewm()
-
-
-def test_expanding():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.expanding()
-
-
-@pytest.fixture
-def test_ffill():
- test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
- ray_df = pd.DataFrame(test_data.tsframe)
-
- assert ray_df_equals_pandas(ray_df.ffill(), test_data.tsframe.ffill())
-
-
-def test_fillna():
- test_fillna_sanity()
- test_fillna_downcast()
- test_ffill()
- test_ffill2()
- test_bfill()
- test_bfill2()
- test_fillna_inplace()
- # test_frame_fillna_limit()
- # test_frame_pad_backfill_limit()
- test_fillna_dtype_conversion()
- test_fillna_skip_certain_blocks()
- test_fillna_dict_series()
-
- with pytest.raises(NotImplementedError):
- test_fillna_dataframe()
-
- test_fillna_columns()
- test_fillna_invalid_method()
- test_fillna_invalid_value()
- test_fillna_col_reordering()
-
-
-@pytest.fixture
-def test_fillna_sanity():
- test_data = TestData()
- tf = test_data.tsframe
- tf.loc[tf.index[:5], 'A'] = np.nan
- tf.loc[tf.index[-5:], 'A'] = np.nan
-
- zero_filled = test_data.tsframe.fillna(0)
- ray_df = pd.DataFrame(test_data.tsframe).fillna(0)
- assert ray_df_equals_pandas(ray_df, zero_filled)
-
- padded = test_data.tsframe.fillna(method='pad')
- ray_df = pd.DataFrame(test_data.tsframe).fillna(method='pad')
- assert ray_df_equals_pandas(ray_df, padded)
-
- # mixed type
- mf = test_data.mixed_frame
- mf.loc[mf.index[5:20], 'foo'] = np.nan
- mf.loc[mf.index[-10:], 'A'] = np.nan
-
- result = test_data.mixed_frame.fillna(value=0)
- ray_df = pd.DataFrame(test_data.mixed_frame).fillna(value=0)
- assert ray_df_equals_pandas(ray_df, result)
-
- result = test_data.mixed_frame.fillna(method='pad')
- ray_df = pd.DataFrame(test_data.mixed_frame).fillna(method='pad')
- assert ray_df_equals_pandas(ray_df, result)
-
- pytest.raises(ValueError, test_data.tsframe.fillna)
- pytest.raises(ValueError, pd.DataFrame(test_data.tsframe).fillna)
- with pytest.raises(ValueError):
- pd.DataFrame(test_data.tsframe).fillna(5, method='ffill')
-
- # mixed numeric (but no float16)
- mf = test_data.mixed_float.reindex(columns=['A', 'B', 'D'])
- mf.loc[mf.index[-10:], 'A'] = np.nan
- result = mf.fillna(value=0)
- ray_df = pd.DataFrame(mf).fillna(value=0)
- assert ray_df_equals_pandas(ray_df, result)
-
- result = mf.fillna(method='pad')
- ray_df = pd.DataFrame(mf).fillna(method='pad')
- assert ray_df_equals_pandas(ray_df, result)
-
- # TODO: Use this when Arrow issue resolves:
- # (https://issues.apache.org/jira/browse/ARROW-2122)
- # empty frame
- # df = DataFrame(columns=['x'])
- # for m in ['pad', 'backfill']:
- # df.x.fillna(method=m, inplace=True)
- # df.x.fillna(method=m)
-
- # with different dtype
- frame_data = [['a', 'a', np.nan, 'a'],
- ['b', 'b', np.nan, 'b'],
- ['c', 'c', np.nan, 'c']]
- df = pandas.DataFrame(frame_data)
-
- result = df.fillna({2: 'foo'})
- ray_df = pd.DataFrame(frame_data).fillna({2: 'foo'})
-
- assert ray_df_equals_pandas(ray_df, result)
-
- ray_df = pd.DataFrame(df)
- df.fillna({2: 'foo'}, inplace=True)
- ray_df.fillna({2: 'foo'}, inplace=True)
- assert ray_df_equals_pandas(ray_df, result)
-
- frame_data = {'Date': [pandas.NaT, pandas.Timestamp("2014-1-1")],
- 'Date2': [pandas.Timestamp("2013-1-1"), pandas.NaT]}
- df = pandas.DataFrame(frame_data)
- result = df.fillna(value={'Date': df['Date2']})
- ray_df = pd.DataFrame(frame_data).fillna(value={'Date': df['Date2']})
- assert ray_df_equals_pandas(ray_df, result)
-
- # TODO: Use this when Arrow issue resolves:
- # (https://issues.apache.org/jira/browse/ARROW-2122)
- # with timezone
- """
- frame_data = {'A': [pandas.Timestamp('2012-11-11 00:00:00+01:00'),
- pandas.NaT]}
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
- assert ray_df_equals_pandas(ray_df.fillna(method='pad'),
- df.fillna(method='pad'))
-
- frame_data = {'A': [pandas.NaT,
- pandas.Timestamp('2012-11-11 00:00:00+01:00')]}
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data).fillna(method='bfill')
- assert ray_df_equals_pandas(ray_df, df.fillna(method='bfill'))
- """
-
-
-@pytest.fixture
-def test_fillna_downcast():
- # infer int64 from float64
- frame_data = {'a': [1., np.nan]}
- df = pandas.DataFrame(frame_data)
- result = df.fillna(0, downcast='infer')
- ray_df = pd.DataFrame(frame_data).fillna(0, downcast='infer')
- assert ray_df_equals_pandas(ray_df, result)
-
- # infer int64 from float64 when fillna value is a dict
- df = pandas.DataFrame(frame_data)
- result = df.fillna({'a': 0}, downcast='infer')
- ray_df = pd.DataFrame(frame_data).fillna({'a': 0}, downcast='infer')
- assert ray_df_equals_pandas(ray_df, result)
-
-
-@pytest.fixture
-def test_ffill2():
- test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
- ray_df = pd.DataFrame(test_data.tsframe)
- assert ray_df_equals_pandas(ray_df.fillna(method='ffill'),
- test_data.tsframe.fillna(method='ffill'))
-
-
-@pytest.fixture
-def test_bfill2():
- test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
- ray_df = pd.DataFrame(test_data.tsframe)
- assert ray_df_equals_pandas(ray_df.fillna(method='bfill'),
- test_data.tsframe.fillna(method='bfill'))
-
-
-@pytest.fixture
-def test_fillna_inplace():
- frame_data = np.random.randn(10, 4)
- df = pandas.DataFrame(frame_data)
- df[1][:4] = np.nan
- df[3][-4:] = np.nan
-
- ray_df = pd.DataFrame(df)
- df.fillna(value=0, inplace=True)
- assert not ray_df_equals_pandas(ray_df, df)
-
- ray_df.fillna(value=0, inplace=True)
- assert ray_df_equals_pandas(ray_df, df)
-
- ray_df = pd.DataFrame(df).fillna(value={0: 0}, inplace=True)
- assert ray_df is None
-
- df[1][:4] = np.nan
- df[3][-4:] = np.nan
- ray_df = pd.DataFrame(df)
- df.fillna(method='ffill', inplace=True)
-
- assert not ray_df_equals_pandas(ray_df, df)
-
- ray_df.fillna(method='ffill', inplace=True)
- assert ray_df_equals_pandas(ray_df, df)
-
-
-@pytest.fixture
-def test_frame_fillna_limit():
- index = np.arange(10)
- frame_data = np.random.randn(10, 4)
- df = pandas.DataFrame(frame_data, index=index)
-
- expected = df[:2].reindex(index)
- expected = expected.fillna(method='pad', limit=5)
-
- ray_df = pd.DataFrame(df[:2].reindex(index)).fillna(method='pad', limit=5)
- assert ray_df_equals_pandas(ray_df, expected)
-
- expected = df[-2:].reindex(index)
- expected = expected.fillna(method='backfill', limit=5)
- ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(method='backfill',
- limit=5)
- assert ray_df_equals_pandas(ray_df, expected)
-
-
-@pytest.fixture
-def test_frame_pad_backfill_limit():
- index = np.arange(10)
- frame_data = np.random.randn(10, 4)
- df = pandas.DataFrame(frame_data, index=index)
-
- result = df[:2].reindex(index)
- ray_df = pd.DataFrame(result)
- assert ray_df_equals_pandas(ray_df.fillna(method='pad', limit=5),
- result.fillna(method='pad', limit=5))
-
- result = df[-2:].reindex(index)
- ray_df = pd.DataFrame(result)
- assert ray_df_equals_pandas(ray_df.fillna(method='backfill', limit=5),
- result.fillna(method='backfill', limit=5))
-
-
-@pytest.fixture
-def test_fillna_dtype_conversion():
- # make sure that fillna on an empty frame works
- df = pandas.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64')
- ray_df = pd.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64')
- assert ray_df_equals_pandas(ray_df.fillna('nan'), df.fillna('nan'))
-
- frame_data = {'A': [1, np.nan], 'B': [1., 2.]}
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
- for v in ['', 1, np.nan, 1.0]:
- assert ray_df_equals_pandas(ray_df.fillna(v), df.fillna(v))
-
-
-@pytest.fixture
-def test_fillna_skip_certain_blocks():
- # don't try to fill boolean, int blocks
-
- frame_data = np.random.randn(10, 4).astype(int)
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(ray_df.fillna(np.nan), df.fillna(np.nan))
-
-
-@pytest.fixture
-def test_fillna_dict_series():
- frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan],
- 'b': [1, 2, 3, np.nan, np.nan],
- 'c': [np.nan, 1, 2, 3, 4]}
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5}),
- df.fillna({'a': 0, 'b': 5}))
-
- assert ray_df_equals_pandas(ray_df.fillna({'a': 0, 'b': 5, 'd': 7}),
- df.fillna({'a': 0, 'b': 5, 'd': 7}))
-
- # Series treated same as dict
- assert ray_df_equals_pandas(ray_df.fillna(df.max()), df.fillna(df.max()))
-
-
-@pytest.fixture
-def test_fillna_dataframe():
- frame_data = {'a': [np.nan, 1, 2, np.nan, np.nan],
- 'b': [1, 2, 3, np.nan, np.nan],
- 'c': [np.nan, 1, 2, 3, 4]}
- df = pandas.DataFrame(frame_data, index=list('VWXYZ'))
- ray_df = pd.DataFrame(frame_data, index=list('VWXYZ'))
-
- # df2 may have different index and columns
- df2 = pandas.DataFrame({'a': [np.nan, 10, 20, 30, 40],
- 'b': [50, 60, 70, 80, 90],
- 'foo': ['bar'] * 5},
- index=list('VWXuZ'))
-
- # only those columns and indices which are shared get filled
- assert ray_df_equals_pandas(ray_df.fillna(df2), df.fillna(df2))
-
-
-@pytest.fixture
-def test_fillna_columns():
- frame_data = np.random.randn(10, 10)
- df = pandas.DataFrame(frame_data)
- df.values[:, ::2] = np.nan
- ray_df = pd.DataFrame(df)
-
- assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1),
- df.fillna(method='ffill', axis=1))
-
- df.insert(6, 'foo', 5)
- ray_df = pd.DataFrame(df)
- assert ray_df_equals_pandas(ray_df.fillna(method='ffill', axis=1),
- df.fillna(method='ffill', axis=1))
-
-
-@pytest.fixture
-def test_fillna_invalid_method():
- test_data = TestData()
- ray_df = pd.DataFrame(test_data.frame)
- with tm.assert_raises_regex(ValueError, 'ffil'):
- ray_df.fillna(method='ffil')
-
-
-@pytest.fixture
-def test_fillna_invalid_value():
- test_data = TestData()
- ray_df = pd.DataFrame(test_data.frame)
- # list
- pytest.raises(TypeError, ray_df.fillna, [1, 2])
- # tuple
- pytest.raises(TypeError, ray_df.fillna, (1, 2))
- # TODO: Uncomment when iloc is implemented
- # frame with series
- # pytest.raises(ValueError, ray_df.iloc[:, 0].fillna, ray_df)
-
-
-@pytest.fixture
-def test_fillna_col_reordering():
- cols = ["COL." + str(i) for i in range(5, 0, -1)]
- data = np.random.rand(20, 5)
- df = pandas.DataFrame(index=range(20), columns=cols, data=data)
- ray_df = pd.DataFrame(index=range(20), columns=cols, data=data)
- assert ray_df_equals_pandas(ray_df.fillna(method='ffill'),
- df.fillna(method='ffill'))
-
-
-"""
-TODO: Use this when Arrow issue resolves:
-(https://issues.apache.org/jira/browse/ARROW-2122)
-@pytest.fixture
-def test_fillna_datetime_columns():
- frame_data = {'A': [-1, -2, np.nan],
- 'B': date_range('20130101', periods=3),
- 'C': ['foo', 'bar', None],
- 'D': ['foo2', 'bar2', None]}
- df = pandas.DataFrame(frame_data, index=date_range('20130110', periods=3))
- ray_df = pd.DataFrame(frame_data, index=date_range('20130110', periods=3))
- assert ray_df_equals_pandas(ray_df.fillna('?'), df.fillna('?'))
-
- frame_data = {'A': [-1, -2, np.nan],
- 'B': [pandas.Timestamp('2013-01-01'),
- pandas.Timestamp('2013-01-02'), pandas.NaT],
- 'C': ['foo', 'bar', None],
- 'D': ['foo2', 'bar2', None]}
- df = pandas.DataFrame(frame_data, index=date_range('20130110', periods=3))
- ray_df = pd.DataFrame(frame_data, index=date_range('20130110', periods=3))
- assert ray_df_equals_pandas(ray_df.fillna('?'), df.fillna('?'))
-"""
-
-
-@pytest.fixture
-def test_filter(ray_df, pandas_df, by):
- assert ray_df_equals_pandas(ray_df.filter(items=by['items']),
- pandas_df.filter(items=by['items']))
-
- assert ray_df_equals_pandas(ray_df.filter(regex=by['regex']),
- pandas_df.filter(regex=by['regex']))
-
- assert ray_df_equals_pandas(ray_df.filter(like=by['like']),
- pandas_df.filter(like=by['like']))
-
-
-def test_first():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.first(None)
-
-
-@pytest.fixture
-def test_first_valid_index(ray_df, pandas_df):
- assert ray_df.first_valid_index() == (pandas_df.first_valid_index())
-
-
-def test_floordiv():
- test_inter_df_math("floordiv", simple=False)
-
-
-def test_from_csv():
- with pytest.raises(NotImplementedError):
- pd.DataFrame.from_csv(None)
-
-
-def test_from_dict():
- with pytest.raises(NotImplementedError):
- pd.DataFrame.from_dict(None)
-
-
-def test_from_items():
- with pytest.raises(NotImplementedError):
- pd.DataFrame.from_items(None)
-
-
-def test_from_records():
- with pytest.raises(NotImplementedError):
- pd.DataFrame.from_records(None)
-
-
-def test_ge():
- test_comparison_inter_ops("ge")
-
-
-def test_get_value():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.get_value(None, None)
-
-
-def test_get_values():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.get_values()
-
-
-def test_gt():
- test_comparison_inter_ops("gt")
-
-
-@pytest.fixture
-def test_head(ray_df, pandas_df, n=5):
- assert ray_df_equals_pandas(ray_df.head(n), pandas_df.head(n))
-
-
-def test_hist():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.hist(None)
-
-
-@pytest.fixture
-def test_idxmax(ray_df, pandas_df):
- assert \
- ray_df.idxmax().equals(pandas_df.idxmax())
-
-
-@pytest.fixture
-def test_idxmin(ray_df, pandas_df):
- assert \
- ray_df.idxmin().equals(pandas_df.idxmin())
-
-
-def test_infer_objects():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.infer_objects()
-
-
-@pytest.fixture
-def test_info(ray_df):
- info_string = ray_df.info()
- assert '\n' in info_string
- info_string = ray_df.info(memory_usage=True)
- assert 'memory_usage: ' in info_string
-
-
-@pytest.fixture
-def test_insert(ray_df, pandas_df, loc, column, value):
- ray_df_cp = ray_df.copy()
- pd_df_cp = pandas_df.copy()
-
- ray_df_cp.insert(loc, column, value)
- pd_df_cp.insert(loc, column, value)
-
-
-def test_interpolate():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.interpolate()
-
-
-@pytest.fixture
-def test_items(ray_df, pandas_df):
- ray_items = ray_df.items()
- pandas_items = pandas_df.items()
- for ray_item, pandas_item in zip(ray_items, pandas_items):
- ray_index, ray_series = ray_item
- pandas_index, pandas_series = pandas_item
- assert pandas_series.equals(ray_series)
- assert pandas_index == ray_index
-
-
-@pytest.fixture
-def test_iteritems(ray_df, pandas_df):
- ray_items = ray_df.iteritems()
- pandas_items = pandas_df.iteritems()
- for ray_item, pandas_item in zip(ray_items, pandas_items):
- ray_index, ray_series = ray_item
- pandas_index, pandas_series = pandas_item
- assert pandas_series.equals(ray_series)
- assert pandas_index == ray_index
-
-
-@pytest.fixture
-def test_iterrows(ray_df, pandas_df):
- ray_iterrows = ray_df.iterrows()
- pandas_iterrows = pandas_df.iterrows()
- for ray_row, pandas_row in zip(ray_iterrows, pandas_iterrows):
- ray_index, ray_series = ray_row
- pandas_index, pandas_series = pandas_row
- assert pandas_series.equals(ray_series)
- assert pandas_index == ray_index
-
-
-@pytest.fixture
-def test_itertuples(ray_df, pandas_df):
- # test default
- ray_it_default = ray_df.itertuples()
- pandas_it_default = pandas_df.itertuples()
- for ray_row, pandas_row in zip(ray_it_default, pandas_it_default):
- np.testing.assert_equal(ray_row, pandas_row)
-
- # test all combinations of custom params
- indices = [True, False]
- names = [None, 'NotPandas', 'Pandas']
-
- for index in indices:
- for name in names:
- ray_it_custom = ray_df.itertuples(index=index, name=name)
- pandas_it_custom = pandas_df.itertuples(index=index, name=name)
- for ray_row, pandas_row in zip(ray_it_custom, pandas_it_custom):
- np.testing.assert_equal(ray_row, pandas_row)
-
-
-def test_join():
- frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
- "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
-
- ray_df = pd.DataFrame(frame_data)
- pandas_df = pandas.DataFrame(frame_data)
-
- frame_data2 = {"col5": [0], "col6": [1]}
- ray_df2 = pd.DataFrame(frame_data2)
- pandas_df2 = pandas.DataFrame(frame_data2)
-
- join_types = ["left", "right", "outer", "inner"]
- for how in join_types:
- ray_join = ray_df.join(ray_df2, how=how)
- pandas_join = pandas_df.join(pandas_df2, how=how)
- assert ray_df_equals_pandas(ray_join, pandas_join)
-
- frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]}
-
- ray_df3 = pd.DataFrame(frame_data3)
- pandas_df3 = pandas.DataFrame(frame_data3)
-
- join_types = ["left", "outer", "inner"]
- for how in join_types:
- ray_join = ray_df.join([ray_df2, ray_df3], how=how)
- pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how)
- assert ray_df_equals_pandas(ray_join, pandas_join)
-
-
-def test_kurt():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.kurt()
-
-
-def test_kurtosis():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.kurtosis()
-
-
-def test_last():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.last(None)
-
-
-@pytest.fixture
-def test_last_valid_index(ray_df, pandas_df):
- assert ray_df.last_valid_index() == (pandas_df.last_valid_index())
-
-
-def test_le():
- test_comparison_inter_ops("le")
-
-
-def test_lookup():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.lookup(None, None)
-
-
-def test_lt():
- test_comparison_inter_ops("lt")
-
-
-def test_mad():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.mad()
-
-
-def test_mask():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.mask(None)
-
-
-@pytest.fixture
-def test_max(ray_df, pandas_df):
- assert ray_series_equals_pandas(ray_df.max(), pandas_df.max())
- assert ray_series_equals_pandas(ray_df.max(axis=1), pandas_df.max(axis=1))
-
-
-@pytest.fixture
-def test_mean(ray_df, pandas_df):
- assert ray_df.mean().equals(pandas_df.mean())
-
-
-@pytest.fixture
-def test_median(ray_df, pandas_df):
- assert ray_df.median().equals(pandas_df.median())
-
-
-def test_melt():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.melt()
-
-
-@pytest.fixture
-def test_memory_usage(ray_df):
- assert type(ray_df.memory_usage()) is pandas.core.series.Series
- assert ray_df.memory_usage(index=True).at['Index'] is not None
- assert ray_df.memory_usage(deep=True).sum() >= \
- ray_df.memory_usage(deep=False).sum()
-
-
-def test_merge():
- frame_data = {"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
- "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}
-
- ray_df = pd.DataFrame(frame_data)
- pandas_df = pandas.DataFrame(frame_data)
-
- frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
- ray_df2 = pd.DataFrame(frame_data2)
- pandas_df2 = pandas.DataFrame(frame_data2)
-
- join_types = ["outer", "inner"]
- for how in join_types:
- # Defaults
- ray_result = ray_df.merge(ray_df2, how=how)
- pandas_result = pandas_df.merge(pandas_df2, how=how)
- ray_df_equals_pandas(ray_result, pandas_result)
-
- # left_on and right_index
- ray_result = ray_df.merge(ray_df2, how=how, left_on='col1',
- right_index=True)
- pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col1',
- right_index=True)
- ray_df_equals_pandas(ray_result, pandas_result)
-
- # left_index and right_index
- ray_result = ray_df.merge(ray_df2, how=how, left_index=True,
- right_index=True)
- pandas_result = pandas_df.merge(pandas_df2, how=how, left_index=True,
- right_index=True)
- ray_df_equals_pandas(ray_result, pandas_result)
-
- # left_index and right_on
- ray_result = ray_df.merge(ray_df2, how=how, left_index=True,
- right_on='col1')
- pandas_result = pandas_df.merge(pandas_df2, how=how, left_index=True,
- right_on='col1')
- ray_df_equals_pandas(ray_result, pandas_result)
-
- # left_on and right_on col1
- ray_result = ray_df.merge(ray_df2, how=how, left_on='col1',
- right_on='col1')
- pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col1',
- right_on='col1')
- ray_df_equals_pandas(ray_result, pandas_result)
-
- # left_on and right_on col2
- ray_result = ray_df.merge(ray_df2, how=how, left_on='col2',
- right_on='col2')
- pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col2',
- right_on='col2')
- ray_df_equals_pandas(ray_result, pandas_result)
-
-
-@pytest.fixture
-def test_min(ray_df, pandas_df):
- assert ray_series_equals_pandas(ray_df.min(), pandas_df.min())
- assert ray_series_equals_pandas(ray_df.min(axis=1), pandas_df.min(axis=1))
-
-
-def test_mod():
- test_inter_df_math("mod", simple=False)
-
-
-@pytest.fixture
-def test_mode(ray_df, pandas_df):
- assert ray_series_equals_pandas(ray_df.mode(), pandas_df.mode())
- assert ray_series_equals_pandas(ray_df.mode(axis=1),
- pandas_df.mode(axis=1))
-
-
-def test_mul():
- test_inter_df_math("mul", simple=False)
-
-
-def test_multiply():
- test_inter_df_math("multiply", simple=False)
-
-
-def test_ne():
- test_comparison_inter_ops("ne")
-
-
-def test_nlargest():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.nlargest(None, None)
-
-
-@pytest.fixture
-def test_notna(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.notna(), pandas_df.notna())
-
-
-@pytest.fixture
-def test_notnull(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.notnull(), pandas_df.notnull())
-
-
-def test_nsmallest():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.nsmallest(None, None)
-
-
-@pytest.fixture
-def test_nunique(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.nunique(), pandas_df.nunique())
- assert ray_df_equals_pandas(ray_df.nunique(axis=1),
- pandas_df.nunique(axis=1))
-
-
-def test_pct_change():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.pct_change()
-
-
-@pytest.fixture
-def test_pipe(ray_df, pandas_df):
- n = len(ray_df.index)
- a, b, c = 2 % n, 0, 3 % n
- col = ray_df.columns[3 % len(ray_df.columns)]
-
- def h(x):
- return x.drop(columns=[col])
-
- def g(x, arg1=0):
- for _ in range(arg1):
- x = x.append(x)
- return x
-
- def f(x, arg2=0, arg3=0):
- return x.drop([arg2, arg3])
-
- assert ray_df_equals(f(g(h(ray_df), arg1=a), arg2=b, arg3=c),
- (ray_df.pipe(h)
- .pipe(g, arg1=a)
- .pipe(f, arg2=b, arg3=c)))
-
- assert ray_df_equals_pandas((ray_df.pipe(h)
- .pipe(g, arg1=a)
- .pipe(f, arg2=b, arg3=c)),
- (pandas_df.pipe(h)
- .pipe(g, arg1=a)
- .pipe(f, arg2=b, arg3=c)))
-
-
-def test_pivot():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.pivot()
-
-
-def test_pivot_table():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.pivot_table()
-
-
-def test_plot():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.plot()
-
-
-@pytest.fixture
-def test_pop(ray_df, pandas_df):
- temp_ray_df = ray_df.copy()
- temp_pandas_df = pandas_df.copy()
- ray_popped = temp_ray_df.pop('col2')
- pandas_popped = temp_pandas_df.pop('col2')
- assert ray_popped.equals(pandas_popped)
- assert ray_df_equals_pandas(temp_ray_df, temp_pandas_df)
-
-
-def test_pow():
- test_inter_df_math("pow", simple=False)
-
-
-@pytest.fixture
-def test_prod(ray_df, pandas_df):
- assert ray_df.prod().equals(pandas_df.prod())
-
-
-@pytest.fixture
-def test_product(ray_df, pandas_df):
- assert ray_df.product().equals(pandas_df.product())
-
-
-@pytest.fixture
-def test_quantile(ray_df, pandas_df, q):
- assert ray_df.quantile(q).equals(pandas_df.quantile(q))
-
-
-@pytest.fixture
-def test_query(ray_df, pandas_df, funcs):
- for f in funcs:
- pandas_df_new, ray_df_new = pandas_df.query(f), ray_df.query(f)
- assert pandas_df_new.equals(to_pandas(ray_df_new))
-
-
-def test_radd():
- test_inter_df_math_right_ops("radd")
-
-
-@pytest.fixture
-def test_rank(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.rank(), pandas_df.rank())
- assert ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1))
-
-
-def test_rdiv():
- test_inter_df_math_right_ops("rdiv")
-
-
-def test_reindex():
- frame_data = {'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]}
- pandas_df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(
- ray_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1]))
-
- assert ray_df_equals_pandas(
- ray_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2]))
-
- assert ray_df_equals_pandas(
- ray_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1),
- pandas_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1))
-
- assert ray_df_equals_pandas(
- ray_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1),
- pandas_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1))
-
- assert ray_df_equals_pandas(
- ray_df.reindex(index=[0, 1, 5],
- columns=['col1', 'col7', 'col4', 'col8']),
- pandas_df.reindex(index=[0, 1, 5],
- columns=['col1', 'col7', 'col4', 'col8']))
-
-
-def test_reindex_axis():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.reindex_axis(None)
-
-
-def test_reindex_like():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.reindex_like(None)
-
-
-# Renaming
-
-def test_rename():
- test_rename_sanity()
- test_rename_multiindex()
- # TODO: Uncomment when __setitem__ is implemented
- # test_rename_nocopy()
- test_rename_inplace()
- test_rename_bug()
-
-
-@pytest.fixture
-def test_rename_sanity():
- test_data = TestData()
- mapping = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}
-
- ray_df = pd.DataFrame(test_data.frame)
- assert ray_df_equals_pandas(ray_df.rename(columns=mapping),
- test_data.frame.rename(columns=mapping))
-
- renamed2 = test_data.frame.rename(columns=str.lower)
- assert ray_df_equals_pandas(ray_df.rename(columns=str.lower), renamed2)
-
- ray_df = pd.DataFrame(renamed2)
- assert ray_df_equals_pandas(ray_df.rename(columns=str.upper),
- renamed2.rename(columns=str.upper))
-
- # index
- data = {'A': {'foo': 0, 'bar': 1}}
-
- # gets sorted alphabetical
- df = pandas.DataFrame(data)
- ray_df = pd.DataFrame(data)
- tm.assert_index_equal(
- ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}).index,
- df.rename(index={'foo': 'bar', 'bar': 'foo'}).index)
-
- tm.assert_index_equal(ray_df.rename(index=str.upper).index,
- df.rename(index=str.upper).index)
-
- # have to pass something
- pytest.raises(TypeError, ray_df.rename)
-
- # partial columns
- renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'})
- ray_df = pd.DataFrame(test_data.frame)
- tm.assert_index_equal(
- ray_df.rename(columns={'C': 'foo', 'D': 'bar'}).index,
- test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}).index)
-
- # TODO: Uncomment when transpose works
- # other axis
- # renamed = test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'})
- # tm.assert_index_equal(
- # test_data.frame.T.rename(index={'C': 'foo', 'D': 'bar'}).index,
- # ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index)
-
- # index with name
- index = pandas.Index(['foo', 'bar'], name='name')
- renamer = pandas.DataFrame(data, index=index)
- ray_df = pd.DataFrame(data, index=index)
-
- renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
- ray_renamed = ray_df.rename(index={'foo': 'bar', 'bar': 'foo'})
- tm.assert_index_equal(renamed.index, ray_renamed.index)
-
- assert renamed.index.name == ray_renamed.index.name
-
-
-@pytest.fixture
-def test_rename_multiindex():
- tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')]
- tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')]
- index = pandas.MultiIndex.from_tuples(tuples_index, names=['foo', 'bar'])
- columns = pandas.MultiIndex.from_tuples(
- tuples_columns, names=['fizz', 'buzz'])
-
- frame_data = [(0, 0), (1, 1)]
- df = pandas.DataFrame(frame_data, index=index, columns=columns)
- ray_df = pd.DataFrame(frame_data, index=index, columns=columns)
-
- #
- # without specifying level -> accross all levels
- renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
- columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
- ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
- columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
- tm.assert_index_equal(renamed.index, ray_renamed.index)
-
- renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
- columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'})
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- assert renamed.index.names == ray_renamed.index.names
- assert renamed.columns.names == ray_renamed.columns.names
-
- #
- # with specifying a level
-
- # dict
- renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
- level=0)
- ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
- level=0)
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
- level='fizz')
- ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
- level='fizz')
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
-
- renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=1)
- ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
- level=1)
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
- level='buzz')
- ray_renamed = ray_df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
- level='buzz')
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
-
- # function
- func = str.upper
- renamed = df.rename(columns=func, level=0)
- ray_renamed = ray_df.rename(columns=func, level=0)
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(columns=func, level='fizz')
- ray_renamed = ray_df.rename(columns=func, level='fizz')
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
-
- renamed = df.rename(columns=func, level=1)
- ray_renamed = ray_df.rename(columns=func, level=1)
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(columns=func, level='buzz')
- ray_renamed = ray_df.rename(columns=func, level='buzz')
- tm.assert_index_equal(renamed.columns, ray_renamed.columns)
-
- # index
- renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0)
- ray_renamed = ray_df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'},
- level=0)
- tm.assert_index_equal(ray_renamed.index, renamed.index)
-
-
-@pytest.fixture
-def test_rename_nocopy():
- test_data = TestData().frame
- ray_df = pd.DataFrame(test_data)
- ray_renamed = ray_df.rename(columns={'C': 'foo'}, copy=False)
- ray_renamed['foo'] = 1
- assert (ray_df['C'] == 1).all()
-
-
-@pytest.fixture
-def test_rename_inplace():
- test_data = TestData().frame
- ray_df = pd.DataFrame(test_data)
-
- assert ray_df_equals_pandas(ray_df.rename(columns={'C': 'foo'}),
- test_data.rename(columns={'C': 'foo'}))
-
- frame = test_data.copy()
- ray_frame = ray_df.copy()
- frame.rename(columns={'C': 'foo'}, inplace=True)
- ray_frame.rename(columns={'C': 'foo'}, inplace=True)
-
- assert ray_df_equals_pandas(ray_frame, frame)
-
-
-@pytest.fixture
-def test_rename_bug():
- # rename set ref_locs, and set_index was not resetting
- frame_data = {0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]}
- df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
- df = df.rename(columns={0: 'a'})
- df = df.rename(columns={1: 'b'})
- # TODO: Uncomment when set_index is implemented
- # df = df.set_index(['a', 'b'])
- # df.columns = ['2001-01-01']
-
- ray_df = ray_df.rename(columns={0: 'a'})
- ray_df = ray_df.rename(columns={1: 'b'})
- # TODO: Uncomment when set_index is implemented
- # ray_df = ray_df.set_index(['a', 'b'])
- # ray_df.columns = ['2001-01-01']
-
- assert ray_df_equals_pandas(ray_df, df)
-
-
-def test_rename_axis_inplace():
- test_frame = TestData().frame
- ray_df = pd.DataFrame(test_frame)
-
- result = test_frame.copy()
- ray_result = ray_df.copy()
- no_return = result.rename_axis('foo', inplace=True)
- ray_no_return = ray_result.rename_axis('foo', inplace=True)
-
- assert no_return is ray_no_return
- assert ray_df_equals_pandas(ray_result, result)
-
- result = test_frame.copy()
- ray_result = ray_df.copy()
- no_return = result.rename_axis('bar', axis=1, inplace=True)
- ray_no_return = ray_result.rename_axis('bar', axis=1, inplace=True)
-
- assert no_return is ray_no_return
- assert ray_df_equals_pandas(ray_result, result)
-
-
-def test_reorder_levels():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.reorder_levels(None)
-
-
-def test_replace():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.replace()
-
-
-def test_resample():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.resample(None)
-
-
-@pytest.fixture
-def test_reset_index(ray_df, pandas_df, inplace=False):
- if not inplace:
- assert to_pandas(ray_df.reset_index(inplace=inplace)).equals(
- pandas_df.reset_index(inplace=inplace))
- else:
- ray_df_cp = ray_df.copy()
- pd_df_cp = pandas_df.copy()
- ray_df_cp.reset_index(inplace=inplace)
- pd_df_cp.reset_index(inplace=inplace)
- assert to_pandas(ray_df_cp).equals(pd_df_cp)
-
-
-@pytest.mark.skip(reason="dtypes on different partitions may not match up, "
- "no fix for this yet")
-def test_rfloordiv():
- test_inter_df_math_right_ops("rfloordiv")
-
-
-@pytest.mark.skip(reason="dtypes on different partitions may not match up, "
- "no fix for this yet")
-def test_rmod():
- test_inter_df_math_right_ops("rmod")
-
-
-def test_rmul():
- test_inter_df_math_right_ops("rmul")
-
-
-def test_rolling():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.rolling(None)
-
-
-@pytest.fixture
-def test_round(ray_df, pd_df):
- assert ray_df_equals_pandas(ray_df.round(), pd_df.round())
- assert ray_df_equals_pandas(ray_df.round(1), pd_df.round(1))
-
-
-def test_rpow():
- test_inter_df_math_right_ops("rpow")
-
-
-def test_rsub():
- test_inter_df_math_right_ops("rsub")
-
-
-@pytest.mark.skip(reason="dtypes on different partitions may not match up, "
- "no fix for this yet")
-def test_rtruediv():
- test_inter_df_math_right_ops("rtruediv")
-
-
-def test_sample():
- ray_df = create_test_dataframe()
- assert len(ray_df.sample(n=4)) == 4
- assert len(ray_df.sample(frac=0.5)) == 2
-
-
-def test_select():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.select(None)
-
-
-def test_select_dtypes():
- frame_data = {'test1': list('abc'),
- 'test2': np.arange(3, 6).astype('u1'),
- 'test3': np.arange(8.0, 11.0, dtype='float64'),
- 'test4': [True, False, True],
- 'test5': pandas.date_range('now', periods=3).values,
- 'test6': list(range(5, 8))}
- df = pandas.DataFrame(frame_data)
- rd = pd.DataFrame(frame_data)
-
- include = np.float, 'integer'
- exclude = np.bool_,
- r = rd.select_dtypes(include=include, exclude=exclude)
-
- e = df[["test2", "test3", "test6"]]
- assert ray_df_equals_pandas(r, e)
-
- try:
- pd.DataFrame().select_dtypes()
- assert False
- except ValueError:
- assert True
-
-
-def test_sem():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.sem()
-
-
-@pytest.fixture
-def test_set_axis(ray_df, pandas_df, label, axis):
- assert to_pandas(ray_df.set_axis(label, axis, inplace=False)).equals(
- pandas_df.set_axis(label, axis, inplace=False))
-
-
-@pytest.fixture
-def test_set_index(ray_df, pandas_df, keys, inplace=False):
- if not inplace:
- assert to_pandas(ray_df.set_index(keys)).equals(
- pandas_df.set_index(keys))
- else:
- ray_df_cp = ray_df.copy()
- pd_df_cp = pandas_df.copy()
- ray_df_cp.set_index(keys, inplace=inplace)
- pd_df_cp.set_index(keys, inplace=inplace)
- assert to_pandas(ray_df_cp).equals(pd_df_cp)
-
-
-def test_set_value():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.set_value(None, None, None)
-
-
-def test_shift():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.shift()
-
-
-@pytest.fixture
-def test_skew(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.skew(), pandas_df.skew())
- assert ray_df_equals_pandas(ray_df.skew(axis=1),
- pandas_df.skew(axis=1))
-
-
-def test_slice_shift():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.slice_shift()
-
-
-def test_sort_index():
- frame_data = np.random.randint(0, 100, size=(1000, 100))
- pandas_df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- pandas_result = pandas_df
- ray_result = ray_df
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
- pandas_result = pandas_df.sort_index(ascending=False)
- ray_result = ray_df.sort_index(ascending=False)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
-
-def test_sort_values():
- frame_data = np.random.randint(0, 100, size=(1000, 100))
- pandas_df = pandas.DataFrame(frame_data)
- ray_df = pd.DataFrame(frame_data)
-
- pandas_result = pandas_df.sort_values(by=1)
- ray_result = ray_df.sort_values(by=1)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
- pandas_result = pandas_df.sort_values(by=1, axis=1)
- ray_result = ray_df.sort_values(by=1, axis=1)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
- pandas_result = pandas_df.sort_values(by=[1, 3])
- ray_result = ray_df.sort_values(by=[1, 3])
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
- pandas_result = pandas_df.sort_values(by=[1, 67], axis=1)
- ray_result = ray_df.sort_values(by=[1, 67], axis=1)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
-
-def test_sortlevel():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.sortlevel()
-
-
-def test_squeeze():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.squeeze()
-
-
-def test_stack():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.stack()
-
-
-@pytest.fixture
-def test_std(ray_df, pandas_df):
- assert ray_df.std().equals(pandas_df.std())
-
-
-def test_sub():
- test_inter_df_math("sub", simple=False)
-
-
-def test_subtract():
- test_inter_df_math("subtract", simple=False)
-
-
-def test_swapaxes():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.swapaxes(None, None)
-
-
-def test_swaplevel():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.swaplevel()
-
-
-@pytest.fixture
-def test_tail(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.tail(), pandas_df.tail())
-
-
-def test_take():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.take(None)
-
-
-def test_to_records():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_records()
-
-
-def test_to_sparse():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_sparse()
-
-
-def test_to_string():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_string()
-
-
-def test_to_timestamp():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_timestamp()
-
-
-def test_to_xarray():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_xarray()
-
-
-@pytest.fixture
-def test_transform(ray_df, pandas_df):
- assert ray_df_equals_pandas(ray_df.transform(lambda df: df.isna()),
- pandas_df.transform(lambda df: df.isna()))
- assert ray_df_equals_pandas(ray_df.transform('isna'),
- pandas_df.transform('isna'))
-
-
-def test_truediv():
- test_inter_df_math("truediv", simple=False)
-
-
-def test_truncate():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.truncate()
-
-
-def test_tshift():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.tshift()
-
-
-def test_tz_convert():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.tz_convert(None)
-
-
-def test_tz_localize():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.tz_localize(None)
-
-
-def test_unstack():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.unstack()
-
-
-def test_update():
- df = pd.DataFrame([[1.5, np.nan, 3.],
- [1.5, np.nan, 3.],
- [1.5, np.nan, 3],
- [1.5, np.nan, 3]])
-
- other = pd.DataFrame([[3.6, 2., np.nan],
- [np.nan, np.nan, 7]], index=[1, 3])
-
- df.update(other)
-
- expected = pd.DataFrame([[1.5, np.nan, 3],
- [3.6, 2, 3],
- [1.5, np.nan, 3],
- [1.5, np.nan, 7.]])
- assert ray_df_equals(df, expected)
-
-
-@pytest.fixture
-def test_var(ray_df, pandas_df):
- assert ray_df.var().equals(pandas_df.var())
-
-
-def test_where():
- frame_data = np.random.randn(100, 10)
- pandas_df = pandas.DataFrame(frame_data, columns=list('abcdefghij'))
- ray_df = pd.DataFrame(frame_data, columns=list('abcdefghij'))
-
- pandas_cond_df = pandas_df % 5 < 2
- ray_cond_df = ray_df % 5 < 2
-
- pandas_result = pandas_df.where(pandas_cond_df, -pandas_df)
- ray_result = ray_df.where(ray_cond_df, -ray_df)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
- other = pandas_df.loc[3]
-
- pandas_result = pandas_df.where(pandas_cond_df, other, axis=1)
- ray_result = ray_df.where(ray_cond_df, other, axis=1)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
- other = pandas_df['e']
-
- pandas_result = pandas_df.where(pandas_cond_df, other, axis=0)
- ray_result = ray_df.where(ray_cond_df, other, axis=0)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
- pandas_result = pandas_df.where(pandas_df < 2, True)
- ray_result = ray_df.where(ray_df < 2, True)
-
- assert ray_df_equals_pandas(ray_result, pandas_result)
-
-
-def test_xs():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.xs(None)
-
-
-@pytest.fixture
-def test___getitem__(ray_df, pd_df):
- ray_col = ray_df.__getitem__('col1')
- assert isinstance(ray_col, pandas.Series)
-
- pd_col = pd_df['col1']
- assert pd_col.equals(ray_col)
-
-
-def test___getattr__():
- df = create_test_dataframe()
-
- col = df.__getattr__("col1")
- assert isinstance(col, pandas.Series)
-
- col = getattr(df, "col1")
- assert isinstance(col, pandas.Series)
-
- col = df.col1
- assert isinstance(col, pandas.Series)
-
- # Check that lookup in column doesn't override other attributes
- df2 = df.rename(index=str, columns={"col5": "columns"})
- assert isinstance(df2.columns, pandas.Index)
-
-
-def test___setitem__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__setitem__(None, None)
-
-
-@pytest.fixture
-def test___len__(ray_df, pandas_df):
- assert len(ray_df) == len(pandas_df)
-
-
-def test___unicode__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__unicode__()
-
-
-@pytest.fixture
-def test___neg__(ray_df, pd_df):
- ray_df_neg = ray_df.__neg__()
- assert pd_df.__neg__().equals(to_pandas(ray_df_neg))
-
-
-def test___invert__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__invert__()
-
-
-def test___hash__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__hash__()
-
-
-@pytest.fixture
-def test___iter__(ray_df, pd_df):
- ray_iterator = ray_df.__iter__()
-
- # Check that ray_iterator implements the iterator interface
- assert hasattr(ray_iterator, '__iter__')
- assert hasattr(ray_iterator, 'next') or hasattr(ray_iterator, '__next__')
-
- pd_iterator = pd_df.__iter__()
- assert list(ray_iterator) == list(pd_iterator)
-
-
-@pytest.fixture
-def test___contains__(ray_df, key, result):
- assert result == ray_df.__contains__(key)
- assert result == (key in ray_df)
-
-
-def test___nonzero__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__nonzero__()
-
-
-def test___bool__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__bool__()
-
-
-@pytest.fixture
-def test___abs__(ray_df, pandas_df):
- assert ray_df_equals_pandas(abs(ray_df), abs(pandas_df))
-
-
-def test___round__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__round__()
-
-
-@pytest.fixture
-def test___array__(ray_df, pandas_df):
- assert np.array_equal(ray_df.__array__(), pandas_df.__array__())
-
-
-def test___getstate__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__getstate__()
-
-
-def test___setstate__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__setstate__(None)
-
-
-@pytest.fixture
-def test___delitem__(ray_df, pd_df):
- ray_df = ray_df.copy()
- pd_df = pd_df.copy()
- ray_df.__delitem__('col1')
- pd_df.__delitem__('col1')
- assert ray_df_equals_pandas(ray_df, pd_df)
-
- # Issue 2027
- last_label = pd_df.iloc[:, -1].name
- ray_df.__delitem__(last_label)
- pd_df.__delitem__(last_label)
- ray_df_equals_pandas(ray_df, pd_df)
-
-
-def test___finalize__():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.__finalize__(None)
-
-
-@pytest.fixture
-def test___copy__(ray_df, pd_df):
- ray_df_copy, pd_df_copy = ray_df.__copy__(), pd_df.__copy__()
- assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)
-
-
-@pytest.fixture
-def test___deepcopy__(ray_df, pd_df):
- ray_df_copy, pd_df_copy = ray_df.__deepcopy__(), pd_df.__deepcopy__()
- assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)
-
-
-def test_blocks():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.blocks
-
-
-def test_style():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.style
-
-
-def test_iat():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.iat()
-
-
-def test___rsub__():
- test_inter_df_math_right_ops("__rsub__")
-
-
-@pytest.fixture
-def test_loc(ray_df, pd_df):
- # Singleton
- assert ray_df.loc[0].equals(pd_df.loc[0])
- assert ray_df.loc[0, 'col1'] == pd_df.loc[0, 'col1']
-
- # List
- assert ray_df.loc[[1, 2]].equals(pd_df.loc[[1, 2]])
- assert ray_df.loc[[1, 2], ['col1']].equals(pd_df.loc[[1, 2], ['col1']])
-
- # Slice
- assert ray_df.loc[1:, 'col1'].equals(pd_df.loc[1:, 'col1'])
- assert ray_df.loc[1:2, 'col1'].equals(pd_df.loc[1:2, 'col1'])
- assert ray_df.loc[1:2, 'col1':'col2'].equals(pd_df.loc[1:2, 'col1':'col2'])
-
-
-def test_is_copy():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.is_copy
-
-
-def test___div__():
- test_inter_df_math("__div__", simple=True)
-
-
-def test_at():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.at()
-
-
-def test_ix():
- ray_df = create_test_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.ix()
-
-
-@pytest.fixture
-def test_iloc(ray_df, pd_df):
- # Singleton
- assert ray_df.iloc[0].equals(pd_df.iloc[0])
- assert ray_df.iloc[0, 1] == pd_df.iloc[0, 1]
-
- # List
- assert ray_df.iloc[[1, 2]].equals(pd_df.iloc[[1, 2]])
- assert ray_df.iloc[[1, 2], [1, 0]].equals(pd_df.iloc[[1, 2], [1, 0]])
-
- # Slice
- assert ray_df.iloc[1:, 0].equals(pd_df.iloc[1:, 0])
- assert ray_df.iloc[1:2, 0].equals(pd_df.iloc[1:2, 0])
- assert ray_df.iloc[1:2, 0:2].equals(pd_df.iloc[1:2, 0:2])
-
-
-def test__doc__():
- assert pd.DataFrame.__doc__ != pandas.DataFrame.__doc__
- assert pd.DataFrame.__init__ != pandas.DataFrame.__init__
- for attr, obj in pd.DataFrame.__dict__.items():
- if (callable(obj) or isinstance(obj, property)) \
- and attr != "__init__":
- pd_obj = getattr(pandas.DataFrame, attr, None)
- if callable(pd_obj) or isinstance(pd_obj, property):
- assert obj.__doc__ == pd_obj.__doc__
-
-
-def test_to_datetime():
- frame_data = {'year': [2015, 2016],
- 'month': [2, 3],
- 'day': [4, 5]}
- ray_df = pd.DataFrame(frame_data)
- pd_df = pandas.DataFrame(frame_data)
-
- pd.to_datetime(ray_df).equals(pandas.to_datetime(pd_df))
-
-
-def test_get_dummies():
- frame_data = {'A': ['a', 'b', 'a'],
- 'B': ['b', 'a', 'c'],
- 'C': [1, 2, 3]}
- ray_df = pd.DataFrame(frame_data)
- pd_df = pandas.DataFrame(frame_data)
-
- assert ray_df_equals_pandas(pd.get_dummies(ray_df),
- pandas.get_dummies(pd_df))
diff --git a/python/ray/dataframe/test/test_groupby.py b/python/ray/dataframe/test/test_groupby.py
deleted file mode 100644
index 47945eb37..000000000
--- a/python/ray/dataframe/test/test_groupby.py
+++ /dev/null
@@ -1,589 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pytest
-import sys
-import pandas
-import numpy as np
-import ray.dataframe as pd
-from ray.dataframe.utils import (
- from_pandas,
- to_pandas)
-
-PY2 = False
-if sys.version_info.major < 3:
- PY2 = True
-
-
-@pytest.fixture
-def ray_df_equals_pandas(ray_df, pandas_df):
- assert isinstance(ray_df, pd.DataFrame)
- assert to_pandas(ray_df).equals(pandas_df)
-
-
-@pytest.fixture
-def ray_df_almost_equals_pandas(ray_df, pandas_df):
- assert isinstance(ray_df, pd.DataFrame)
- difference = to_pandas(ray_df) - pandas_df
- diff_max = difference.max().max()
- assert to_pandas(ray_df).equals(pandas_df) or diff_max < 0.0001
-
-
-@pytest.fixture
-def ray_series_equals_pandas(ray_df, pandas_df):
- assert ray_df.equals(pandas_df)
-
-
-@pytest.fixture
-def ray_df_equals(ray_df1, ray_df2):
- assert to_pandas(ray_df1).equals(to_pandas(ray_df2))
-
-
-@pytest.fixture
-def ray_groupby_equals_pandas(ray_groupby, pandas_groupby):
- for g1, g2 in zip(ray_groupby, pandas_groupby):
- assert g1[0] == g2[0]
- ray_df_equals_pandas(g1[1], g2[1])
-
-
-def test_simple_row_groupby():
- pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [3, 8, 12, 10],
- 'col4': [17, 13, 16, 15],
- 'col5': [-4, -5, -6, -7]})
-
- ray_df = from_pandas(pandas_df, 2)
-
- by = [1, 2, 1, 2]
- n = 1
-
- ray_groupby = ray_df.groupby(by=by)
- pandas_groupby = pandas_df.groupby(by=by)
-
- ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
- test_ngroups(ray_groupby, pandas_groupby)
- test_skew(ray_groupby, pandas_groupby)
- test_ffill(ray_groupby, pandas_groupby)
- test_sem(ray_groupby, pandas_groupby)
- test_mean(ray_groupby, pandas_groupby)
- test_any(ray_groupby, pandas_groupby)
- test_min(ray_groupby, pandas_groupby)
- test_idxmax(ray_groupby, pandas_groupby)
- test_ndim(ray_groupby, pandas_groupby)
- test_cumsum(ray_groupby, pandas_groupby)
- test_pct_change(ray_groupby, pandas_groupby)
- test_cummax(ray_groupby, pandas_groupby)
-
- apply_functions = [lambda df: df.sum(), lambda df: -df]
- for func in apply_functions:
- test_apply(ray_groupby, pandas_groupby, func)
-
- test_dtypes(ray_groupby, pandas_groupby)
- test_first(ray_groupby, pandas_groupby)
- test_backfill(ray_groupby, pandas_groupby)
- test_cummin(ray_groupby, pandas_groupby)
- test_bfill(ray_groupby, pandas_groupby)
- test_idxmin(ray_groupby, pandas_groupby)
- test_prod(ray_groupby, pandas_groupby)
- test_std(ray_groupby, pandas_groupby)
-
- agg_functions = ['min', 'max']
- for func in agg_functions:
- test_agg(ray_groupby, pandas_groupby, func)
- test_aggregate(ray_groupby, pandas_groupby, func)
-
- test_last(ray_groupby, pandas_groupby)
- test_mad(ray_groupby, pandas_groupby)
- test_rank(ray_groupby, pandas_groupby)
- test_max(ray_groupby, pandas_groupby)
- test_var(ray_groupby, pandas_groupby)
- test_len(ray_groupby, pandas_groupby)
- test_sum(ray_groupby, pandas_groupby)
- test_ngroup(ray_groupby, pandas_groupby)
- test_nunique(ray_groupby, pandas_groupby)
- test_median(ray_groupby, pandas_groupby)
- test_head(ray_groupby, pandas_groupby, n)
- test_cumprod(ray_groupby, pandas_groupby)
- test_cov(ray_groupby, pandas_groupby)
-
- transform_functions = [lambda df: df + 4, lambda df: -df - 10]
- for func in transform_functions:
- test_transform(ray_groupby, pandas_groupby, func)
-
- pipe_functions = [lambda dfgb: dfgb.sum()]
- for func in pipe_functions:
- test_pipe(ray_groupby, pandas_groupby, func)
-
- test_corr(ray_groupby, pandas_groupby)
- test_fillna(ray_groupby, pandas_groupby)
- test_count(ray_groupby, pandas_groupby)
- test_tail(ray_groupby, pandas_groupby, n)
- test_quantile(ray_groupby, pandas_groupby)
- test_take(ray_groupby, pandas_groupby)
-
-
-def test_single_group_row_groupby():
- pandas_df = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 36, 7],
- 'col3': [3, 8, 12, 10],
- 'col4': [17, 3, 16, 15],
- 'col5': [-4, 5, -6, -7]})
-
- ray_df = from_pandas(pandas_df, 2)
-
- by = [1, 1, 1, 1]
- n = 6
-
- ray_groupby = ray_df.groupby(by=by)
- pandas_groupby = pandas_df.groupby(by=by)
-
- ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
- test_ngroups(ray_groupby, pandas_groupby)
- test_skew(ray_groupby, pandas_groupby)
- test_ffill(ray_groupby, pandas_groupby)
- test_sem(ray_groupby, pandas_groupby)
- test_mean(ray_groupby, pandas_groupby)
- test_any(ray_groupby, pandas_groupby)
- test_min(ray_groupby, pandas_groupby)
- test_idxmax(ray_groupby, pandas_groupby)
- test_ndim(ray_groupby, pandas_groupby)
- test_cumsum(ray_groupby, pandas_groupby)
- test_pct_change(ray_groupby, pandas_groupby)
- test_cummax(ray_groupby, pandas_groupby)
-
- apply_functions = [lambda df: df.sum(), lambda df: -df]
- for func in apply_functions:
- test_apply(ray_groupby, pandas_groupby, func)
-
- test_dtypes(ray_groupby, pandas_groupby)
- test_first(ray_groupby, pandas_groupby)
- test_backfill(ray_groupby, pandas_groupby)
- test_cummin(ray_groupby, pandas_groupby)
- test_bfill(ray_groupby, pandas_groupby)
- test_idxmin(ray_groupby, pandas_groupby)
- test_prod(ray_groupby, pandas_groupby)
- test_std(ray_groupby, pandas_groupby)
-
- agg_functions = ['min', 'max']
- for func in agg_functions:
- test_agg(ray_groupby, pandas_groupby, func)
- test_aggregate(ray_groupby, pandas_groupby, func)
-
- test_last(ray_groupby, pandas_groupby)
- test_mad(ray_groupby, pandas_groupby)
- test_rank(ray_groupby, pandas_groupby)
- test_max(ray_groupby, pandas_groupby)
- test_var(ray_groupby, pandas_groupby)
- test_len(ray_groupby, pandas_groupby)
- test_sum(ray_groupby, pandas_groupby)
- test_ngroup(ray_groupby, pandas_groupby)
- test_nunique(ray_groupby, pandas_groupby)
- test_median(ray_groupby, pandas_groupby)
- test_head(ray_groupby, pandas_groupby, n)
- test_cumprod(ray_groupby, pandas_groupby)
- test_cov(ray_groupby, pandas_groupby)
-
- transform_functions = [lambda df: df + 4, lambda df: -df - 10]
- for func in transform_functions:
- test_transform(ray_groupby, pandas_groupby, func)
-
- pipe_functions = [lambda dfgb: dfgb.sum()]
- for func in pipe_functions:
- test_pipe(ray_groupby, pandas_groupby, func)
-
- test_corr(ray_groupby, pandas_groupby)
- test_fillna(ray_groupby, pandas_groupby)
- test_count(ray_groupby, pandas_groupby)
- test_tail(ray_groupby, pandas_groupby, n)
- test_quantile(ray_groupby, pandas_groupby)
- test_take(ray_groupby, pandas_groupby)
-
-
-def test_large_row_groupby():
- pandas_df = pandas.DataFrame(np.random.randint(0, 8, size=(100, 4)),
- columns=list('ABCD'))
-
- ray_df = from_pandas(pandas_df, 2)
-
- by = pandas_df['A'].tolist()
- n = 4
-
- ray_groupby = ray_df.groupby(by=by)
- pandas_groupby = pandas_df.groupby(by=by)
-
- ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
- test_ngroups(ray_groupby, pandas_groupby)
- test_skew(ray_groupby, pandas_groupby)
- test_ffill(ray_groupby, pandas_groupby)
- test_sem(ray_groupby, pandas_groupby)
- test_mean(ray_groupby, pandas_groupby)
- test_any(ray_groupby, pandas_groupby)
- test_min(ray_groupby, pandas_groupby)
- test_idxmax(ray_groupby, pandas_groupby)
- test_ndim(ray_groupby, pandas_groupby)
- test_cumsum(ray_groupby, pandas_groupby)
- test_pct_change(ray_groupby, pandas_groupby)
- test_cummax(ray_groupby, pandas_groupby)
-
- apply_functions = [lambda df: df.sum(), lambda df: -df]
- for func in apply_functions:
- test_apply(ray_groupby, pandas_groupby, func)
-
- test_dtypes(ray_groupby, pandas_groupby)
- test_first(ray_groupby, pandas_groupby)
- test_backfill(ray_groupby, pandas_groupby)
- test_cummin(ray_groupby, pandas_groupby)
- test_bfill(ray_groupby, pandas_groupby)
- test_idxmin(ray_groupby, pandas_groupby)
- # test_prod(ray_groupby, pandas_groupby) causes overflows
- test_std(ray_groupby, pandas_groupby)
-
- agg_functions = ['min', 'max']
- for func in agg_functions:
- test_agg(ray_groupby, pandas_groupby, func)
- test_aggregate(ray_groupby, pandas_groupby, func)
-
- test_last(ray_groupby, pandas_groupby)
- test_mad(ray_groupby, pandas_groupby)
- test_rank(ray_groupby, pandas_groupby)
- test_max(ray_groupby, pandas_groupby)
- test_var(ray_groupby, pandas_groupby)
- test_len(ray_groupby, pandas_groupby)
- test_sum(ray_groupby, pandas_groupby)
- test_ngroup(ray_groupby, pandas_groupby)
- test_nunique(ray_groupby, pandas_groupby)
- test_median(ray_groupby, pandas_groupby)
- test_head(ray_groupby, pandas_groupby, n)
- # test_cumprod(ray_groupby, pandas_groupby) causes overflows
- test_cov(ray_groupby, pandas_groupby)
-
- transform_functions = [lambda df: df + 4, lambda df: -df - 10]
- for func in transform_functions:
- test_transform(ray_groupby, pandas_groupby, func)
-
- pipe_functions = [lambda dfgb: dfgb.sum()]
- for func in pipe_functions:
- test_pipe(ray_groupby, pandas_groupby, func)
-
- test_corr(ray_groupby, pandas_groupby)
- test_fillna(ray_groupby, pandas_groupby)
- test_count(ray_groupby, pandas_groupby)
- test_tail(ray_groupby, pandas_groupby, n)
- test_quantile(ray_groupby, pandas_groupby)
- test_take(ray_groupby, pandas_groupby)
-
-
-def test_simple_col_groupby():
- pandas_df = pandas.DataFrame({'col1': [0, 3, 2, 3],
- 'col2': [4, 1, 6, 7],
- 'col3': [3, 8, 2, 10],
- 'col4': [1, 13, 6, 15],
- 'col5': [-4, 5, 6, -7]})
-
- ray_df = from_pandas(pandas_df, 2)
-
- by = [1, 2, 3, 2, 1]
-
- ray_groupby = ray_df.groupby(axis=1, by=by)
- pandas_groupby = pandas_df.groupby(axis=1, by=by)
-
- ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
- test_ngroups(ray_groupby, pandas_groupby)
- test_skew(ray_groupby, pandas_groupby)
- test_ffill(ray_groupby, pandas_groupby)
- test_sem(ray_groupby, pandas_groupby)
- test_mean(ray_groupby, pandas_groupby)
- test_any(ray_groupby, pandas_groupby)
- test_min(ray_groupby, pandas_groupby)
- test_ndim(ray_groupby, pandas_groupby)
-
- if not PY2:
- # idxmax and idxmin fail on column groupby in pandas with python2
- test_idxmax(ray_groupby, pandas_groupby)
- test_idxmin(ray_groupby, pandas_groupby)
- test_rank(ray_groupby, pandas_groupby)
- test_quantile(ray_groupby, pandas_groupby)
-
- # https://github.com/pandas-dev/pandas/issues/21127
- # test_cumsum(ray_groupby, pandas_groupby)
- # test_cummax(ray_groupby, pandas_groupby)
- # test_cummin(ray_groupby, pandas_groupby)
- # test_cumprod(ray_groupby, pandas_groupby)
-
- test_pct_change(ray_groupby, pandas_groupby)
- apply_functions = [lambda df: -df, lambda df: df.sum(axis=1)]
- for func in apply_functions:
- test_apply(ray_groupby, pandas_groupby, func)
-
- test_first(ray_groupby, pandas_groupby)
- test_backfill(ray_groupby, pandas_groupby)
- test_bfill(ray_groupby, pandas_groupby)
- test_prod(ray_groupby, pandas_groupby)
- test_std(ray_groupby, pandas_groupby)
- test_last(ray_groupby, pandas_groupby)
- test_mad(ray_groupby, pandas_groupby)
- test_max(ray_groupby, pandas_groupby)
- test_var(ray_groupby, pandas_groupby)
- test_len(ray_groupby, pandas_groupby)
- test_sum(ray_groupby, pandas_groupby)
-
- # Pandas fails on this case with ValueError
- # test_ngroup(ray_groupby, pandas_groupby)
- # test_nunique(ray_groupby, pandas_groupby)
- test_median(ray_groupby, pandas_groupby)
- test_cov(ray_groupby, pandas_groupby)
-
- transform_functions = [lambda df: df + 4, lambda df: -df - 10]
- for func in transform_functions:
- test_transform(ray_groupby, pandas_groupby, func)
-
- pipe_functions = [lambda dfgb: dfgb.sum()]
- for func in pipe_functions:
- test_pipe(ray_groupby, pandas_groupby, func)
-
- test_corr(ray_groupby, pandas_groupby)
- test_fillna(ray_groupby, pandas_groupby)
- test_count(ray_groupby, pandas_groupby)
- test_take(ray_groupby, pandas_groupby)
-
-
-@pytest.fixture
-def test_ngroups(ray_groupby, pandas_groupby):
- assert ray_groupby.ngroups == pandas_groupby.ngroups
-
-
-@pytest.fixture
-def test_skew(ray_groupby, pandas_groupby):
- ray_df_almost_equals_pandas(ray_groupby.skew(), pandas_groupby.skew())
-
-
-@pytest.fixture
-def test_ffill(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.ffill(), pandas_groupby.ffill())
-
-
-@pytest.fixture
-def test_sem(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.sem()
-
-
-@pytest.fixture
-def test_mean(ray_groupby, pandas_groupby):
- ray_df_almost_equals_pandas(ray_groupby.mean(), pandas_groupby.mean())
-
-
-@pytest.fixture
-def test_any(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.any(), pandas_groupby.any())
-
-
-@pytest.fixture
-def test_min(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.min(), pandas_groupby.min())
-
-
-@pytest.fixture
-def test_idxmax(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.idxmax(), pandas_groupby.idxmax())
-
-
-@pytest.fixture
-def test_ndim(ray_groupby, pandas_groupby):
- assert ray_groupby.ndim == pandas_groupby.ndim
-
-
-@pytest.fixture
-def test_cumsum(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.cumsum(), pandas_groupby.cumsum())
- ray_df_equals_pandas(ray_groupby.cumsum(axis=1),
- pandas_groupby.cumsum(axis=1))
-
-
-@pytest.fixture
-def test_pct_change(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.pct_change()
-
-
-@pytest.fixture
-def test_cummax(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.cummax(), pandas_groupby.cummax())
- ray_df_equals_pandas(ray_groupby.cummax(axis=1),
- pandas_groupby.cummax(axis=1))
-
-
-@pytest.fixture
-def test_apply(ray_groupby, pandas_groupby, func):
- ray_df_equals_pandas(ray_groupby.apply(func), pandas_groupby.apply(func))
-
-
-@pytest.fixture
-def test_dtypes(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.dtypes, pandas_groupby.dtypes)
-
-
-@pytest.fixture
-def test_first(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.first()
-
-
-@pytest.fixture
-def test_backfill(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.backfill(), pandas_groupby.backfill())
-
-
-@pytest.fixture
-def test_cummin(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.cummin(), pandas_groupby.cummin())
- ray_df_equals_pandas(ray_groupby.cummin(axis=1),
- pandas_groupby.cummin(axis=1))
-
-
-@pytest.fixture
-def test_bfill(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.bfill(), pandas_groupby.bfill())
-
-
-@pytest.fixture
-def test_idxmin(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.idxmin(), pandas_groupby.idxmin())
-
-
-@pytest.fixture
-def test_prod(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.prod(), pandas_groupby.prod())
-
-
-@pytest.fixture
-def test_std(ray_groupby, pandas_groupby):
- ray_df_almost_equals_pandas(ray_groupby.std(), pandas_groupby.std())
-
-
-@pytest.fixture
-def test_aggregate(ray_groupby, pandas_groupby, func):
- ray_df_equals_pandas(ray_groupby.aggregate(func),
- pandas_groupby.aggregate(func))
-
-
-@pytest.fixture
-def test_agg(ray_groupby, pandas_groupby, func):
- ray_df_equals_pandas(ray_groupby.agg(func), pandas_groupby.agg(func))
-
-
-@pytest.fixture
-def test_last(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.last()
-
-
-@pytest.fixture
-def test_mad(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.mad()
-
-
-@pytest.fixture
-def test_rank(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.rank(), pandas_groupby.rank())
-
-
-@pytest.fixture
-def test_max(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.max(), pandas_groupby.max())
-
-
-@pytest.fixture
-def test_var(ray_groupby, pandas_groupby):
- ray_df_almost_equals_pandas(ray_groupby.var(), pandas_groupby.var())
-
-
-@pytest.fixture
-def test_len(ray_groupby, pandas_groupby):
- assert len(ray_groupby) == len(pandas_groupby)
-
-
-@pytest.fixture
-def test_sum(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.sum(), pandas_groupby.sum())
-
-
-@pytest.fixture
-def test_ngroup(ray_groupby, pandas_groupby):
- ray_series_equals_pandas(ray_groupby.ngroup(), pandas_groupby.ngroup())
-
-
-@pytest.fixture
-def test_nunique(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.nunique(), pandas_groupby.nunique())
-
-
-@pytest.fixture
-def test_median(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.median(), pandas_groupby.median())
-
-
-@pytest.fixture
-def test_head(ray_groupby, pandas_groupby, n):
- ray_df_equals_pandas(ray_groupby.head(n=n), pandas_groupby.head(n=n))
-
-
-@pytest.fixture
-def test_cumprod(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.cumprod(), pandas_groupby.cumprod())
- ray_df_equals_pandas(ray_groupby.cumprod(axis=1),
- pandas_groupby.cumprod(axis=1))
-
-
-@pytest.fixture
-def test_cov(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.cov()
-
-
-@pytest.fixture
-def test_transform(ray_groupby, pandas_groupby, func):
- ray_df_equals_pandas(ray_groupby.transform(func),
- pandas_groupby.transform(func))
-
-
-@pytest.fixture
-def test_corr(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.corr()
-
-
-@pytest.fixture
-def test_fillna(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.fillna(method="ffill"),
- pandas_groupby.fillna(method="ffill"))
-
-
-@pytest.fixture
-def test_count(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.count(), pandas_groupby.count())
-
-
-@pytest.fixture
-def test_pipe(ray_groupby, pandas_groupby, func):
- ray_df_equals_pandas(ray_groupby.pipe(func), pandas_groupby.pipe(func))
-
-
-@pytest.fixture
-def test_tail(ray_groupby, pandas_groupby, n):
- ray_df_equals_pandas(ray_groupby.tail(n=n), pandas_groupby.tail(n=n))
-
-
-@pytest.fixture
-def test_quantile(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(ray_groupby.quantile(q=0.4),
- pandas_groupby.quantile(q=0.4))
-
-
-@pytest.fixture
-def test_take(ray_groupby, pandas_groupby):
- with pytest.raises(NotImplementedError):
- ray_groupby.take(indices=[1])
diff --git a/python/ray/dataframe/test/test_io.py b/python/ray/dataframe/test/test_io.py
deleted file mode 100644
index c2ab544be..000000000
--- a/python/ray/dataframe/test/test_io.py
+++ /dev/null
@@ -1,675 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pytest
-import numpy as np
-import pandas
-from ray.dataframe.utils import to_pandas
-import ray.dataframe as pd
-import os
-import sqlite3
-
-TEST_PARQUET_FILENAME = 'test.parquet'
-TEST_CSV_FILENAME = 'test.csv'
-TEST_JSON_FILENAME = 'test.json'
-TEST_HTML_FILENAME = 'test.html'
-TEST_EXCEL_FILENAME = 'test.xlsx'
-TEST_FEATHER_FILENAME = 'test.feather'
-TEST_HDF_FILENAME = 'test.hdf'
-TEST_MSGPACK_FILENAME = 'test.msg'
-TEST_STATA_FILENAME = 'test.dta'
-TEST_PICKLE_FILENAME = 'test.pkl'
-TEST_SAS_FILENAME = os.getcwd() + '/data/test1.sas7bdat'
-TEST_SQL_FILENAME = 'test.db'
-SMALL_ROW_SIZE = 2000
-LARGE_ROW_SIZE = 7e6
-
-
-@pytest.fixture
-def ray_df_equals_pandas(ray_df, pandas_df):
- return to_pandas(ray_df).sort_index().equals(pandas_df.sort_index())
-
-
-@pytest.fixture
-def setup_parquet_file(row_size, force=False):
- if os.path.exists(TEST_PARQUET_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_parquet(TEST_PARQUET_FILENAME)
-
-
-@pytest.fixture
-def create_test_ray_dataframe():
- df = pd.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]})
-
- return df
-
-
-@pytest.fixture
-def create_test_pandas_dataframe():
- df = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]})
-
- return df
-
-
-@pytest.fixture
-def test_files_eq(path1, path2):
- with open(path1, 'rb') as file1, open(path2, 'rb') as file2:
- file1_content = file1.read()
- file2_content = file2.read()
-
- if file1_content == file2_content:
- return True
- else:
- return False
-
-
-@pytest.fixture
-def teardown_test_file(test_path):
- if os.path.exists(test_path):
- os.remove(test_path)
-
-
-@pytest.fixture
-def teardown_parquet_file():
- if os.path.exists(TEST_PARQUET_FILENAME):
- os.remove(TEST_PARQUET_FILENAME)
-
-
-@pytest.fixture
-def setup_csv_file(row_size, force=False, delimiter=','):
- if os.path.exists(TEST_CSV_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_csv(TEST_CSV_FILENAME, sep=delimiter)
-
-
-@pytest.fixture
-def teardown_csv_file():
- if os.path.exists(TEST_CSV_FILENAME):
- os.remove(TEST_CSV_FILENAME)
-
-
-@pytest.fixture
-def setup_json_file(row_size, force=False):
- if os.path.exists(TEST_JSON_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_json(TEST_JSON_FILENAME)
-
-
-@pytest.fixture
-def teardown_json_file():
- if os.path.exists(TEST_JSON_FILENAME):
- os.remove(TEST_JSON_FILENAME)
-
-
-@pytest.fixture
-def setup_html_file(row_size, force=False):
- if os.path.exists(TEST_HTML_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_html(TEST_HTML_FILENAME)
-
-
-@pytest.fixture
-def teardown_html_file():
- if os.path.exists(TEST_HTML_FILENAME):
- os.remove(TEST_HTML_FILENAME)
-
-
-@pytest.fixture
-def setup_clipboard(row_size, force=False):
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_clipboard()
-
-
-@pytest.fixture
-def setup_excel_file(row_size, force=False):
- if os.path.exists(TEST_EXCEL_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_excel(TEST_EXCEL_FILENAME)
-
-
-@pytest.fixture
-def teardown_excel_file():
- if os.path.exists(TEST_EXCEL_FILENAME):
- os.remove(TEST_EXCEL_FILENAME)
-
-
-@pytest.fixture
-def setup_feather_file(row_size, force=False):
- if os.path.exists(TEST_FEATHER_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_feather(TEST_FEATHER_FILENAME)
-
-
-@pytest.fixture
-def teardown_feather_file():
- if os.path.exists(TEST_FEATHER_FILENAME):
- os.remove(TEST_FEATHER_FILENAME)
-
-
-@pytest.fixture
-def setup_hdf_file(row_size, force=False):
- if os.path.exists(TEST_HDF_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_hdf(TEST_HDF_FILENAME, 'test')
-
-
-@pytest.fixture
-def teardown_hdf_file():
- if os.path.exists(TEST_HDF_FILENAME):
- os.remove(TEST_HDF_FILENAME)
-
-
-@pytest.fixture
-def setup_msgpack_file(row_size, force=False):
- if os.path.exists(TEST_MSGPACK_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_msgpack(TEST_MSGPACK_FILENAME)
-
-
-@pytest.fixture
-def teardown_msgpack_file():
- if os.path.exists(TEST_MSGPACK_FILENAME):
- os.remove(TEST_MSGPACK_FILENAME)
-
-
-@pytest.fixture
-def setup_stata_file(row_size, force=False):
- if os.path.exists(TEST_STATA_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_stata(TEST_STATA_FILENAME)
-
-
-@pytest.fixture
-def teardown_stata_file():
- if os.path.exists(TEST_STATA_FILENAME):
- os.remove(TEST_STATA_FILENAME)
-
-
-@pytest.fixture
-def setup_pickle_file(row_size, force=False):
- if os.path.exists(TEST_PICKLE_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_pickle(TEST_PICKLE_FILENAME)
-
-
-@pytest.fixture
-def teardown_pickle_file():
- if os.path.exists(TEST_PICKLE_FILENAME):
- os.remove(TEST_PICKLE_FILENAME)
-
-
-@pytest.fixture
-def setup_sql_file(conn, force=False):
- if os.path.exists(TEST_SQL_FILENAME) and not force:
- pass
- else:
- df = pandas.DataFrame({'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]})
- df.to_sql(TEST_SQL_FILENAME.split(".")[0], conn)
-
-
-@pytest.fixture
-def teardown_sql_file():
- if os.path.exists(TEST_SQL_FILENAME):
- os.remove(TEST_SQL_FILENAME)
-
-
-def test_from_parquet_small():
-
- setup_parquet_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
- ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_parquet_file()
-
-
-def test_from_parquet_large():
- setup_parquet_file(LARGE_ROW_SIZE)
-
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
- ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_parquet_file()
-
-
-def test_from_csv():
- setup_csv_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME)
- ray_df = pd.read_csv(TEST_CSV_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_csv_file()
-
-
-def test_from_json():
- setup_json_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_json(TEST_JSON_FILENAME)
- ray_df = pd.read_json(TEST_JSON_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_json_file()
-
-
-def test_from_html():
- setup_html_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_html(TEST_HTML_FILENAME)[0]
- ray_df = pd.read_html(TEST_HTML_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_html_file()
-
-
-@pytest.mark.skip(reason="No clipboard on Travis")
-def test_from_clipboard():
- setup_clipboard(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_clipboard()
- ray_df = pd.read_clipboard()
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
-
-def test_from_excel():
- setup_excel_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_excel(TEST_EXCEL_FILENAME)
- ray_df = pd.read_excel(TEST_EXCEL_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_excel_file()
-
-
-def test_from_feather():
- setup_feather_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_feather(TEST_FEATHER_FILENAME)
- ray_df = pd.read_feather(TEST_FEATHER_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_feather_file()
-
-
-@pytest.mark.skip(reason="Memory overflow on Travis")
-def test_from_hdf():
- setup_hdf_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key='test')
- ray_df = pd.read_hdf(TEST_HDF_FILENAME, key='test')
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_hdf_file()
-
-
-def test_from_msgpack():
- setup_msgpack_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_msgpack(TEST_MSGPACK_FILENAME)
- ray_df = pd.read_msgpack(TEST_MSGPACK_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_msgpack_file()
-
-
-def test_from_stata():
- setup_stata_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_stata(TEST_STATA_FILENAME)
- ray_df = pd.read_stata(TEST_STATA_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_stata_file()
-
-
-def test_from_pickle():
- setup_pickle_file(SMALL_ROW_SIZE)
-
- pandas_df = pandas.read_pickle(TEST_PICKLE_FILENAME)
- ray_df = pd.read_pickle(TEST_PICKLE_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_pickle_file()
-
-
-def test_from_sql():
- conn = sqlite3.connect(TEST_SQL_FILENAME)
- setup_sql_file(conn, True)
-
- pandas_df = pandas.read_sql("select * from test", conn)
- ray_df = pd.read_sql("select * from test", conn)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_sql_file()
-
-
-@pytest.mark.skip(reason="No SAS write methods in Pandas")
-def test_from_sas():
- pandas_df = pandas.read_sas(TEST_SAS_FILENAME)
- ray_df = pd.read_sas(TEST_SAS_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
-
-def test_from_csv_delimiter():
- setup_csv_file(SMALL_ROW_SIZE, delimiter='|')
-
- pandas_df = pandas.read_csv(TEST_CSV_FILENAME)
- ray_df = pd.read_csv(TEST_CSV_FILENAME)
-
- assert ray_df_equals_pandas(ray_df, pandas_df)
-
- teardown_csv_file()
-
-
-@pytest.mark.skip(reason="No clipboard on Travis")
-def test_to_clipboard():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- ray_df.to_clipboard()
- ray_as_clip = pandas.read_clipboard()
-
- pandas_df.to_clipboard()
- pandas_as_clip = pandas.read_clipboard()
-
- assert(ray_as_clip.equals(pandas_as_clip))
-
-
-def test_to_csv():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_CSV_DF_FILENAME = "test_df.csv"
- TEST_CSV_pandas_FILENAME = "test_pandas.csv"
-
- ray_df.to_csv(TEST_CSV_DF_FILENAME)
- pandas_df.to_csv(TEST_CSV_pandas_FILENAME)
-
- assert(test_files_eq(TEST_CSV_DF_FILENAME,
- TEST_CSV_pandas_FILENAME))
-
- teardown_test_file(TEST_CSV_pandas_FILENAME)
- teardown_test_file(TEST_CSV_DF_FILENAME)
-
-
-def test_to_dense():
- ray_df = create_test_ray_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_dense()
-
-
-def test_to_dict():
- ray_df = create_test_ray_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_dict()
-
-
-def test_to_excel():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_EXCEL_DF_FILENAME = "test_df.xlsx"
- TEST_EXCEL_pandas_FILENAME = "test_pandas.xlsx"
-
- ray_writer = pandas.ExcelWriter(TEST_EXCEL_DF_FILENAME)
- pandas_writer = pandas.ExcelWriter(TEST_EXCEL_pandas_FILENAME)
-
- ray_df.to_excel(ray_writer)
- pandas_df.to_excel(pandas_writer)
-
- ray_writer.save()
- pandas_writer.save()
-
- assert(test_files_eq(TEST_EXCEL_DF_FILENAME,
- TEST_EXCEL_pandas_FILENAME))
-
- teardown_test_file(TEST_EXCEL_DF_FILENAME)
- teardown_test_file(TEST_EXCEL_pandas_FILENAME)
-
-
-def test_to_feather():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_FEATHER_DF_FILENAME = "test_df.feather"
- TEST_FEATHER_pandas_FILENAME = "test_pandas.feather"
-
- ray_df.to_feather(TEST_FEATHER_DF_FILENAME)
- pandas_df.to_feather(TEST_FEATHER_pandas_FILENAME)
-
- assert(test_files_eq(TEST_FEATHER_DF_FILENAME,
- TEST_FEATHER_pandas_FILENAME))
-
- teardown_test_file(TEST_FEATHER_pandas_FILENAME)
- teardown_test_file(TEST_FEATHER_DF_FILENAME)
-
-
-def test_to_gbq():
- ray_df = create_test_ray_dataframe()
-
- TEST_GBQ_DF_FILENAME = "test_df.gbq"
- with pytest.raises(NotImplementedError):
- ray_df.to_gbq(TEST_GBQ_DF_FILENAME, None)
-
-
-def test_to_html():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_HTML_DF_FILENAME = "test_df.html"
- TEST_HTML_pandas_FILENAME = "test_pandas.html"
-
- ray_df.to_html(TEST_HTML_DF_FILENAME)
- pandas_df.to_html(TEST_HTML_pandas_FILENAME)
-
- assert(test_files_eq(TEST_HTML_DF_FILENAME,
- TEST_HTML_pandas_FILENAME))
-
- teardown_test_file(TEST_HTML_pandas_FILENAME)
- teardown_test_file(TEST_HTML_DF_FILENAME)
-
-
-def test_to_json():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_JSON_DF_FILENAME = "test_df.json"
- TEST_JSON_pandas_FILENAME = "test_pandas.json"
-
- ray_df.to_json(TEST_JSON_DF_FILENAME)
- pandas_df.to_json(TEST_JSON_pandas_FILENAME)
-
- assert(test_files_eq(TEST_JSON_DF_FILENAME,
- TEST_JSON_pandas_FILENAME))
-
- teardown_test_file(TEST_JSON_pandas_FILENAME)
- teardown_test_file(TEST_JSON_DF_FILENAME)
-
-
-def test_to_latex():
- ray_df = create_test_ray_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_latex()
-
-
-def test_to_msgpack():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_MSGPACK_DF_FILENAME = "test_df.msgpack"
- TEST_MSGPACK_pandas_FILENAME = "test_pandas.msgpack"
-
- ray_df.to_msgpack(TEST_MSGPACK_DF_FILENAME)
- pandas_df.to_msgpack(TEST_MSGPACK_pandas_FILENAME)
-
- assert(test_files_eq(TEST_MSGPACK_DF_FILENAME,
- TEST_MSGPACK_pandas_FILENAME))
-
- teardown_test_file(TEST_MSGPACK_pandas_FILENAME)
- teardown_test_file(TEST_MSGPACK_DF_FILENAME)
-
-
-def test_to_panel():
- ray_df = create_test_ray_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_panel()
-
-
-def test_to_parquet():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_PARQUET_DF_FILENAME = "test_df.parquet"
- TEST_PARQUET_pandas_FILENAME = "test_pandas.parquet"
-
- ray_df.to_parquet(TEST_PARQUET_DF_FILENAME)
- pandas_df.to_parquet(TEST_PARQUET_pandas_FILENAME)
-
- assert(test_files_eq(TEST_PARQUET_DF_FILENAME,
- TEST_PARQUET_pandas_FILENAME))
-
- teardown_test_file(TEST_PARQUET_pandas_FILENAME)
- teardown_test_file(TEST_PARQUET_DF_FILENAME)
-
-
-def test_to_period():
- ray_df = create_test_ray_dataframe()
-
- with pytest.raises(NotImplementedError):
- ray_df.to_period()
-
-
-def test_to_pickle():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_PICKLE_DF_FILENAME = "test_df.pkl"
- TEST_PICKLE_pandas_FILENAME = "test_pandas.pkl"
-
- ray_df.to_pickle(TEST_PICKLE_DF_FILENAME)
- pandas_df.to_pickle(TEST_PICKLE_pandas_FILENAME)
-
- assert(test_files_eq(TEST_PICKLE_DF_FILENAME,
- TEST_PICKLE_pandas_FILENAME))
-
- teardown_test_file(TEST_PICKLE_pandas_FILENAME)
- teardown_test_file(TEST_PICKLE_DF_FILENAME)
-
-
-def test_to_sql():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_SQL_DF_FILENAME = "test_df.sql"
- TEST_SQL_pandas_FILENAME = "test_pandas.sql"
-
- ray_df.to_pickle(TEST_SQL_DF_FILENAME)
- pandas_df.to_pickle(TEST_SQL_pandas_FILENAME)
-
- assert(test_files_eq(TEST_SQL_DF_FILENAME,
- TEST_SQL_pandas_FILENAME))
-
- teardown_test_file(TEST_SQL_DF_FILENAME)
- teardown_test_file(TEST_SQL_pandas_FILENAME)
-
-
-def test_to_stata():
- ray_df = create_test_ray_dataframe()
- pandas_df = create_test_pandas_dataframe()
-
- TEST_STATA_DF_FILENAME = "test_df.stata"
- TEST_STATA_pandas_FILENAME = "test_pandas.stata"
-
- ray_df.to_stata(TEST_STATA_DF_FILENAME)
- pandas_df.to_stata(TEST_STATA_pandas_FILENAME)
-
- assert(test_files_eq(TEST_STATA_DF_FILENAME,
- TEST_STATA_pandas_FILENAME))
-
- teardown_test_file(TEST_STATA_pandas_FILENAME)
- teardown_test_file(TEST_STATA_DF_FILENAME)
diff --git a/python/ray/dataframe/test/test_series.py b/python/ray/dataframe/test/test_series.py
deleted file mode 100644
index bf26e32e8..000000000
--- a/python/ray/dataframe/test/test_series.py
+++ /dev/null
@@ -1,1992 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import pytest
-import ray.dataframe as pd
-
-
-@pytest.fixture
-def create_test_series():
- return pd.Series(None)
-
-
-def test_T():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.T
-
-
-def test___abs__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__abs__()
-
-
-def test___add__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__add__(None, None)
-
-
-def test___and__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__and__(None)
-
-
-def test___array__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__array__(None)
-
-
-def test___array_prepare__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__array_prepare__(None)
-
-
-def test___array_priority__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__array_priority__
-
-
-def test___array_wrap__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__array_wrap__(None)
-
-
-def test___bool__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__bool__()
-
-
-def test___bytes__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__bytes__()
-
-
-def test___class__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__class__(None, None, None, None, None)
-
-
-def test___contains__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__contains__(None)
-
-
-def test___copy__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__copy__(None)
-
-
-def test___deepcopy__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__deepcopy__(None)
-
-
-def test___delitem__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__delitem__(None)
-
-
-def test___div__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__div__(None, None)
-
-
-def test___divmod__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__divmod__(None, None)
-
-
-def test___doc__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__doc__
-
-
-def test___eq__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__eq__(None)
-
-
-def test___finalize__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__finalize__(None, None)
-
-
-def test___float__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__float__()
-
-
-def test___floordiv__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__floordiv__(None, None)
-
-
-def test___ge__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__ge__(None)
-
-
-def test___getitem__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__getitem__(None)
-
-
-def test___getstate__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__getstate__()
-
-
-def test___gt__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__gt__(None)
-
-
-def test___iadd__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__iadd__(None)
-
-
-def test___imul__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__imul__(None)
-
-
-def test___int__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__int__()
-
-
-def test___invert__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__invert__()
-
-
-def test___ipow__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__ipow__(None)
-
-
-def test___isub__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__isub__(None)
-
-
-def test___iter__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__iter__()
-
-
-def test___itruediv__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__itruediv__(None)
-
-
-def test___le__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__le__(None)
-
-
-def test___len__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__len__()
-
-
-def test___long__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__long__()
-
-
-def test___lt__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__lt__(None)
-
-
-def test___mod__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__mod__(None, None)
-
-
-def test___mul__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__mul__(None, None)
-
-
-def test___ne__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__ne__(None)
-
-
-def test___neg__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__neg__()
-
-
-def test___nonzero__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__nonzero__()
-
-
-def test___or__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__or__(None)
-
-
-def test___pow__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__pow__(None, None)
-
-
-def test___repr__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__repr__()
-
-
-def test___round__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__round__(None)
-
-
-def test___setitem__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__setitem__(None, None)
-
-
-def test___setstate__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__setstate__(None)
-
-
-def test___sizeof__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__sizeof__()
-
-
-def test___str__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__str__()
-
-
-def test___sub__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__sub__(None, None)
-
-
-def test___truediv__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__truediv__(None, None)
-
-
-def test___xor__():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.__xor__(None)
-
-
-def test_abs():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.abs()
-
-
-def test_add():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.add(None, None, None)
-
-
-def test_add_prefix():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.add_prefix(None)
-
-
-def test_add_suffix():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.add_suffix(None)
-
-
-def test_agg():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.agg(None, None, None)
-
-
-def test_aggregate():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.aggregate(None, None, None)
-
-
-def test_align():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.align(None, None, None, None, None, None, None, None, None)
-
-
-def test_all():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.all(None, None, None, None)
-
-
-def test_any():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.any(None, None, None, None)
-
-
-def test_append():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.append(None, None)
-
-
-def test_apply():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.apply(None, None, None)
-
-
-def test_argmax():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.argmax(None, None, None)
-
-
-def test_argmin():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.argmin(None, None, None)
-
-
-def test_argsort():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.argsort(None, None)
-
-
-def test_as_blocks():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.as_blocks(None)
-
-
-def test_as_matrix():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.as_matrix(None)
-
-
-def test_asfreq():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.asfreq(None, None, None, None)
-
-
-def test_asobject():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.asobject
-
-
-def test_asof():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.asof(None)
-
-
-def test_astype():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.astype(None, None, None)
-
-
-def test_at():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.at(None)
-
-
-def test_at_time():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.at_time(None)
-
-
-def test_autocorr():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.autocorr(None)
-
-
-def test_axes():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.axes
-
-
-def test_base():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.base
-
-
-def test_between():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.between(None, None)
-
-
-def test_between_time():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.between_time(None, None, None)
-
-
-def test_bfill():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.bfill(None, None, None)
-
-
-def test_blocks():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.blocks
-
-
-def test_bool():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.bool()
-
-
-def test_clip():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.clip(None, None, None, None)
-
-
-def test_clip_lower():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.clip_lower(None)
-
-
-def test_clip_upper():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.clip_upper(None)
-
-
-def test_combine():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.combine(None, None)
-
-
-def test_combine_first():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.combine_first(None)
-
-
-def test_compound():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.compound(None, None)
-
-
-def test_compress():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.compress(None, None)
-
-
-def test_consolidate():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.consolidate(None)
-
-
-def test_convert_objects():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.convert_objects(None, None, None)
-
-
-def test_copy():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.copy(None)
-
-
-def test_corr():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.corr(None, None)
-
-
-def test_count():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.count(None)
-
-
-def test_cov():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.cov(None)
-
-
-def test_cummax():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.cummax(None, None, None)
-
-
-def test_cummin():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.cummin(None, None, None)
-
-
-def test_cumprod():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.cumprod(None, None, None)
-
-
-def test_cumsum():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.cumsum(None, None, None)
-
-
-def test_data():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.data
-
-
-def test_describe():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.describe(None, None)
-
-
-def test_diff():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.diff(None)
-
-
-def test_div():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.div(None, None, None)
-
-
-def test_divide():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.divide(None, None, None)
-
-
-def test_dot():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.dot(None)
-
-
-def test_drop():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.drop(None, None, None, None)
-
-
-def test_drop_duplicates():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.drop_duplicates(None)
-
-
-def test_dropna():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.dropna(None, None)
-
-
-def test_dtype():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.dtype
-
-
-def test_dtypes():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.dtypes
-
-
-def test_duplicated():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.duplicated(None)
-
-
-def test_empty():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.empty
-
-
-def test_eq():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.eq(None, None, None)
-
-
-def test_equals():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.equals(None)
-
-
-def test_ewm():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ewm(None, None, None, None, None, None, None, None)
-
-
-def test_expanding():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.expanding(None, None, None)
-
-
-def test_factorize():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.factorize(None)
-
-
-def test_ffill():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ffill(None, None, None)
-
-
-def test_fillna():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.fillna(None, None, None, None, None, None)
-
-
-def test_filter():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.filter(None, None, None)
-
-
-def test_first():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.first(None)
-
-
-def test_first_valid_index():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.first_valid_index()
-
-
-def test_flags():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.flags
-
-
-def test_floordiv():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.floordiv(None, None, None)
-
-
-def test_from_array():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.from_array(None, None, None, None, None)
-
-
-def test_from_csv():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.from_csv(None, None, None, None, None, None)
-
-
-def test_ftype():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ftype
-
-
-def test_ftypes():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ftypes
-
-
-def test_ge():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ge(None, None, None)
-
-
-def test_get():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.get(None)
-
-
-def test_get_dtype_counts():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.get_dtype_counts()
-
-
-def test_get_ftype_counts():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.get_ftype_counts()
-
-
-def test_get_value():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.get_value(None)
-
-
-def test_get_values():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.get_values()
-
-
-def test_groupby():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.groupby(None, None, None, None, None, None, None)
-
-
-def test_gt():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.gt(None, None, None)
-
-
-def test_hasnans():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.hasnans
-
-
-def test_head():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.head(None)
-
-
-def test_hist():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.hist(None, None, None, None, None, None, None, None, None)
-
-
-def test_iat():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.iat(None)
-
-
-def test_idxmax():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.idxmax(None, None, None)
-
-
-def test_idxmin():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.idxmin(None, None, None)
-
-
-def test_iloc():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.iloc(None)
-
-
-def test_imag():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.imag
-
-
-def test_index():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.index
-
-
-def test_interpolate():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.interpolate(None, None, None, None, None, None)
-
-
-def test_is_copy():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.is_copy
-
-
-def test_is_monotonic():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.is_monotonic
-
-
-def test_is_monotonic_decreasing():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.is_monotonic_decreasing
-
-
-def test_is_monotonic_increasing():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.is_monotonic_increasing
-
-
-def test_is_unique():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.is_unique
-
-
-def test_isin():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.isin(None)
-
-
-def test_isnull():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.isnull()
-
-
-def test_item():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.item()
-
-
-def test_items():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.items()
-
-
-def test_itemsize():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.itemsize
-
-
-def test_iteritems():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.iteritems()
-
-
-def test_ix():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ix(None)
-
-
-def test_keys():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.keys()
-
-
-def test_kurt():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.kurt(None, None, None, None)
-
-
-def test_kurtosis():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.kurtosis(None, None, None, None)
-
-
-def test_last():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.last(None)
-
-
-def test_last_valid_index():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.last_valid_index()
-
-
-def test_le():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.le(None, None, None)
-
-
-def test_loc():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.loc(None)
-
-
-def test_lt():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.lt(None, None, None)
-
-
-def test_mad():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.mad(None, None)
-
-
-def test_map():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.map(None)
-
-
-def test_mask():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.mask(None, None, None, None, None, None)
-
-
-def test_max():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.max(None, None, None, None)
-
-
-def test_mean():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.mean(None, None, None, None)
-
-
-def test_median():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.median(None, None, None, None)
-
-
-def test_memory_usage():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.memory_usage(None)
-
-
-def test_min():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.min(None, None, None, None)
-
-
-def test_mod():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.mod(None, None, None)
-
-
-def test_mode():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.mode()
-
-
-def test_mul():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.mul(None, None, None)
-
-
-def test_multiply():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.multiply(None, None, None)
-
-
-def test_name():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.name
-
-
-def test_nbytes():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.nbytes
-
-
-def test_ndim():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ndim
-
-
-def test_ne():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ne(None, None, None)
-
-
-def test_nlargest():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.nlargest(None)
-
-
-def test_nonzero():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.nonzero()
-
-
-def test_notnull():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.notnull()
-
-
-def test_nsmallest():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.nsmallest(None)
-
-
-def test_nunique():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.nunique(None)
-
-
-def test_pct_change():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.pct_change(None, None, None, None)
-
-
-def test_pipe():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.pipe(None, None)
-
-
-def test_plot():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.plot(None, None, None, None, None, None, None, None, None,
- None, None, None, None, None, None, None, None, None,
- None, None, None, None, None)
-
-
-def test_pop():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.pop(None)
-
-
-def test_pow():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.pow(None, None, None)
-
-
-def test_prod():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.prod(None, None, None, None)
-
-
-def test_product():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.product(None, None, None, None)
-
-
-def test_ptp():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ptp(None, None, None, None)
-
-
-def test_put():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.put(None)
-
-
-def test_quantile():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.quantile(None)
-
-
-def test_radd():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.radd(None, None, None)
-
-
-def test_rank():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rank(None, None, None, None, None)
-
-
-def test_ravel():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.ravel(None)
-
-
-def test_rdiv():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rdiv(None, None, None)
-
-
-def test_real():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.real
-
-
-def test_reindex():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.reindex(None)
-
-
-def test_reindex_axis():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.reindex_axis(None, None)
-
-
-def test_reindex_like():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.reindex_like(None, None, None, None)
-
-
-def test_rename():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rename(None)
-
-
-def test_rename_axis():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rename_axis(None, None, None)
-
-
-def test_reorder_levels():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.reorder_levels(None)
-
-
-def test_repeat():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.repeat(None, None)
-
-
-def test_replace():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.replace(None, None, None, None, None, None)
-
-
-def test_resample():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.resample(None, None, None, None, None, None, None, None,
- None, None, None, None)
-
-
-def test_reset_index():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.reset_index(None, None, None)
-
-
-def test_reshape():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.reshape(None)
-
-
-def test_rfloordiv():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rfloordiv(None, None, None)
-
-
-def test_rmod():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rmod(None, None, None)
-
-
-def test_rmul():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rmul(None, None, None)
-
-
-def test_rolling():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rolling(None, None, None, None, None, None, None)
-
-
-def test_round():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.round(None, None)
-
-
-def test_rpow():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rpow(None, None, None)
-
-
-def test_rsub():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rsub(None, None, None)
-
-
-def test_rtruediv():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.rtruediv(None, None, None)
-
-
-def test_sample():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.sample(None, None, None, None, None)
-
-
-def test_searchsorted():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.searchsorted(None, None)
-
-
-def test_select():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.select(None)
-
-
-def test_sem():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.sem(None, None, None, None, None)
-
-
-def test_set_axis():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.set_axis(None, None)
-
-
-def test_set_value():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.set_value(None, None)
-
-
-def test_shape():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.shape
-
-
-def test_shift():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.shift(None, None)
-
-
-def test_size():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.size
-
-
-def test_skew():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.skew(None, None, None, None)
-
-
-def test_slice_shift():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.slice_shift(None)
-
-
-def test_sort_index():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.sort_index(None, None, None, None, None, None)
-
-
-def test_sort_values():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.sort_values(None, None, None, None)
-
-
-def test_sortlevel():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.sortlevel(None, None)
-
-
-def test_squeeze():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.squeeze(None)
-
-
-def test_std():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.std(None, None, None, None, None)
-
-
-def test_strides():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.strides
-
-
-def test_sub():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.sub(None, None, None)
-
-
-def test_subtract():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.subtract(None, None, None)
-
-
-def test_sum():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.sum(None, None, None, None)
-
-
-def test_swapaxes():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.swapaxes(None, None)
-
-
-def test_swaplevel():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.swaplevel(None, None)
-
-
-def test_tail():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.tail(None)
-
-
-def test_take():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.take(None, None, None, None)
-
-
-def test_to_clipboard():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_clipboard(None, None)
-
-
-def test_to_csv():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_csv(None, None, None, None, None, None, None, None,
- None, None)
-
-
-def test_to_dense():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_dense()
-
-
-def test_to_dict():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_dict()
-
-
-def test_to_excel():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_excel(None, None, None, None, None, None, None, None,
- None, None, None, None, None, None)
-
-
-def test_to_frame():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_frame(None)
-
-
-def test_to_hdf():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_hdf(None, None)
-
-
-def test_to_json():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_json(None, None, None, None, None, None, None)
-
-
-def test_to_latex():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_latex(None, None, None, None, None, None, None, None,
- None, None, None, None, None, None, None, None,
- None, None)
-
-
-def test_to_msgpack():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_msgpack(None, None)
-
-
-def test_to_period():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_period(None)
-
-
-def test_to_pickle():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_pickle(None)
-
-
-def test_to_sparse():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_sparse(None)
-
-
-def test_to_sql():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_sql(None, None, None, None, None, None, None, None)
-
-
-def test_to_string():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_string(None, None, None, None, None, None, None, None)
-
-
-def test_to_timestamp():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_timestamp(None, None)
-
-
-def test_to_xarray():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.to_xarray()
-
-
-def test_tolist():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.tolist()
-
-
-def test_transform():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.transform(None, None)
-
-
-def test_transpose():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.transpose(None)
-
-
-def test_truediv():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.truediv(None, None, None)
-
-
-def test_truncate():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.truncate(None, None, None)
-
-
-def test_tshift():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.tshift(None, None)
-
-
-def test_tz_convert():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.tz_convert(None, None, None)
-
-
-def test_tz_localize():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.tz_localize(None, None, None, None)
-
-
-def test_unique():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.unique()
-
-
-def test_unstack():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.unstack(None)
-
-
-def test_update():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.update(None)
-
-
-def test_valid():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.valid(None)
-
-
-def test_value_counts():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.value_counts(None, None, None, None)
-
-
-def test_values():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.values
-
-
-def test_var():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.var(None, None, None, None, None)
-
-
-def test_view():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.view(None)
-
-
-def test_where():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.where(None, None, None, None, None, None)
-
-
-def test_xs():
- ray_series = create_test_series()
-
- with pytest.raises(NotImplementedError):
- ray_series.xs(None, None, None)
diff --git a/python/ray/dataframe/utils.py b/python/ray/dataframe/utils.py
deleted file mode 100644
index 746ea5915..000000000
--- a/python/ray/dataframe/utils.py
+++ /dev/null
@@ -1,577 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import pandas
-import numpy as np
-import ray
-
-from . import get_npartitions
-
-
-_NAN_BLOCKS = {}
-_MEMOIZER_CAPACITY = 1000 # Capacity per function
-
-
-class LRUCache:
- """A LRUCache implemented with collections.OrderedDict
-
- Notes:
- - OrderedDict will record the order each item is inserted.
- - The head of the queue will be LRU items.
- """
-
- def __init__(self, capacity):
- self.capacity = capacity
- self.cache = collections.OrderedDict()
-
- def __contains__(self, key):
- return key in self.cache
-
- def __getitem__(self, key):
- """Retrieve item from cache and re-insert it to the back of the queue
- """
- value = self.cache.pop(key)
- self.cache[key] = value
- return value
-
- def __setitem__(self, key, value):
- if key in self.cache:
- self.cache.pop(key)
-
- if len(self.cache) >= self.capacity:
- # Pop oldest items at the beginning of the queue
- self.cache.popitem(last=False)
-
- self.cache[key] = value
-
-
-class memoize:
- """A basic memoizer that cache the input and output of the remote function
-
- Notes:
- - How is this implemented?
- This meoizer is implemented by adding a caching layer to the remote
- function's remote attribute. When user call f.remote(*args), we will
- first check against the cache, and then call the ray remote function
- if we can't find the return value in the cache.
- - When should this be used?
- This should be used when we anticipate temporal locality for the
- function. For example, we can reasonally assume users will perform
- columnar operation repetitively over time (like sum() or loc[]).
- - Caveat
- Don't use this decorator if the any argument to the remote function
- will mutate. Following snippet will fail
- ```py
- @memoize
- @ray.remote
- def f(obj):
- ...
-
- mutable_obj = [1]
- oid_1 = f.remote(mutable_obj) # will be cached
-
- mutable_obj.append(3)
- oid_2 = f.remote(mutable_obj) # cache hit!
-
- oid_1 == oid_2 # True!
- ```
- In short, use this function sparingly. The ideal case is that all
- inputs are ray ObjectIDs because they are immutable objects.
- - Future Development
- - Fix the mutability bug
- - Dynamic cache size (Fixed as 1000 for now)
- """
-
- def __init__(self, f):
- # Save of remote function
- self.old_remote_func = f.remote
- self.cache = LRUCache(capacity=_MEMOIZER_CAPACITY)
-
- def remote(self, *args):
- """Return cached result if the arguments are cached
- """
- args = tuple(args)
-
- if args in self.cache:
- cached_result = self.cache[args]
- return cached_result
-
- result = self.old_remote_func(*args)
- self.cache[args] = result
- return result
-
-
-def _get_nan_block_id(n_row=1, n_col=1, transpose=False):
- """A memory efficent way to get a block of NaNs.
-
- Args:
- n_rows(int): number of rows
- n_col(int): number of columns
- transpose(bool): if true, swap rows and columns
- Returns:
- ObjectID of the NaN block
- """
- global _NAN_BLOCKS
- if transpose:
- n_row, n_col = n_col, n_row
- shape = (n_row, n_col)
- if shape not in _NAN_BLOCKS:
- arr = np.tile(np.array(np.NaN), shape)
- _NAN_BLOCKS[shape] = ray.put(pandas.DataFrame(data=arr))
- return _NAN_BLOCKS[shape]
-
-
-def _get_lengths(df):
- """Gets the length of the dataframe.
- Args:
- df: A remote pandas.DataFrame object.
- Returns:
- Returns an integer length of the dataframe object. If the attempt
- fails, returns 0 as the length.
- """
- try:
- return len(df)
- # Because we sometimes have cases where we have summary statistics in our
- # DataFrames
- except TypeError:
- return 0
-
-
-def _get_widths(df):
- """Gets the width (number of columns) of the dataframe.
- Args:
- df: A remote pandas.DataFrame object.
- Returns:
- Returns an integer width of the dataframe object. If the attempt
- fails, returns 0 as the length.
- """
- try:
- return len(df.columns)
- # Because we sometimes have cases where we have summary statistics in our
- # DataFrames
- except TypeError:
- return 0
-
-
-def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None):
- """Partitions a Pandas DataFrame object.
- Args:
- df (pandas.DataFrame): The pandas DataFrame to convert.
- npartitions (int): The number of partitions to split the DataFrame
- into. Has priority over chunksize.
- row_chunksize (int): The number of rows to put in each partition.
- Returns:
- [ObjectID]: A list of object IDs corresponding to the dataframe
- partitions
- """
- if num_partitions is not None:
- row_chunksize = len(df) // num_partitions \
- if len(df) % num_partitions == 0 \
- else len(df) // num_partitions + 1
- else:
- assert row_chunksize is not None
-
- temp_df = df
-
- row_partitions = []
- while len(temp_df) > row_chunksize:
- t_df = temp_df[:row_chunksize]
- # reset_index here because we want a pandas.RangeIndex
- # within the partitions. It is smaller and sometimes faster.
- t_df.reset_index(drop=True, inplace=True)
- t_df.columns = pandas.RangeIndex(0, len(t_df.columns))
- top = ray.put(t_df)
- row_partitions.append(top)
- temp_df = temp_df[row_chunksize:]
- else:
- # Handle the last chunk correctly.
- # This call is necessary to prevent modifying original df
- temp_df = temp_df[:]
- temp_df.reset_index(drop=True, inplace=True)
- temp_df.columns = pandas.RangeIndex(0, len(temp_df.columns))
- row_partitions.append(ray.put(temp_df))
-
- return row_partitions
-
-
-def from_pandas(df, num_partitions=None, chunksize=None):
- """Converts a pandas DataFrame to a Ray DataFrame.
- Args:
- df (pandas.DataFrame): The pandas DataFrame to convert.
- num_partitions (int): The number of partitions to split the DataFrame
- into. Has priority over chunksize.
- chunksize (int): The number of rows to put in each partition.
- Returns:
- A new Ray DataFrame object.
- """
- from .dataframe import DataFrame
-
- row_partitions = \
- _partition_pandas_dataframe(df, num_partitions, chunksize)
-
- return DataFrame(row_partitions=row_partitions,
- columns=df.columns,
- index=df.index)
-
-
-def to_pandas(df):
- """Converts a Ray DataFrame to a pandas DataFrame/Series.
- Args:
- df (ray.DataFrame): The Ray DataFrame to convert.
- Returns:
- A new pandas DataFrame.
- """
- pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
- pandas_df.index = df.index
- pandas_df.columns = df.columns
- return pandas_df
-
-
-@ray.remote
-def extractor(df_chunk, row_loc, col_loc):
- """Retrieve an item from remote block
- """
- # We currently have to do the writable flag trick because a pandas bug
- # https://github.com/pandas-dev/pandas/issues/17192
- try:
- row_loc.flags.writeable = True
- col_loc.flags.writeable = True
- except AttributeError:
- # Locators might be scaler or python list
- pass
- return df_chunk.iloc[row_loc, col_loc]
-
-
-@ray.remote
-def writer(df_chunk, row_loc, col_loc, item):
- """Make a copy of the block and write new item to it
- """
- df_chunk = df_chunk.copy()
- df_chunk.iloc[row_loc, col_loc] = item
- return df_chunk
-
-
-def _mask_block_partitions(blk_partitions, row_metadata, col_metadata):
- """Return the squeezed/expanded block partitions as defined by
- row_metadata and col_metadata.
-
- Note:
- Very naive implementation. Extract one scaler at a time in a double
- for loop.
- """
- col_df = col_metadata._coord_df
- row_df = row_metadata._coord_df
-
- result_oids = []
- shape = (len(row_df.index), len(col_df.index))
-
- for _, row_partition_data in row_df.iterrows():
- for _, col_partition_data in col_df.iterrows():
- row_part = row_partition_data.partition
- col_part = col_partition_data.partition
- block_oid = blk_partitions[row_part, col_part]
-
- row_idx = row_partition_data['index_within_partition']
- col_idx = col_partition_data['index_within_partition']
-
- result_oid = extractor.remote(block_oid, [row_idx], [col_idx])
- result_oids.append(result_oid)
- return np.array(result_oids).reshape(shape)
-
-
-@ray.remote
-def _deploy_func(func, dataframe, *args):
- """Deploys a function for the _map_partitions call.
- Args:
- dataframe (pandas.DataFrame): The pandas DataFrame for this partition.
- Returns:
- A futures object representing the return value of the function
- provided.
- """
- if len(args) == 0:
- return func(dataframe)
- else:
- return func(dataframe, *args)
-
-
-def _map_partitions(func, partitions, *argslists):
- """Apply a function across the specified axis
-
- Args:
- func (callable): The function to apply
- partitions ([ObjectID]): The list of partitions to map func on.
-
- Returns:
- A list of partitions ([ObjectID]) with the result of the function
- """
- if partitions is None:
- return None
-
- assert(callable(func))
- if len(argslists) == 0:
- return [_deploy_func.remote(func, part) for part in partitions]
- elif len(argslists) == 1:
- return [_deploy_func.remote(func, part, argslists[0])
- for part in partitions]
- else:
- assert(all(len(args) == len(partitions) for args in argslists))
- return [_deploy_func.remote(func, *args)
- for args in zip(partitions, *argslists)]
-
-
-@ray.remote
-def _build_col_widths(df_col):
- """Compute widths (# of columns) for each partition."""
- widths = np.array(ray.get([_deploy_func.remote(_get_widths, d)
- for d in df_col]))
-
- return widths
-
-
-@ray.remote
-def _build_row_lengths(df_row):
- """Compute lengths (# of rows) for each partition."""
- lengths = np.array(ray.get([_deploy_func.remote(_get_lengths, d)
- for d in df_row]))
-
- return lengths
-
-
-@ray.remote
-def _build_coord_df(lengths, index):
- """Build the coordinate dataframe over all partitions."""
- filtered_lengths = [x for x in lengths if x > 0]
- coords = None
- if len(filtered_lengths) > 0:
- coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l)))
- for i, l in enumerate(filtered_lengths)])
- col_names = ("partition", "index_within_partition")
- return pandas.DataFrame(coords, index=index, columns=col_names)
-
-
-def _create_block_partitions(partitions, axis=0, length=None):
-
- if length is not None and length != 0 and get_npartitions() > length:
- npartitions = length
- elif length == 0:
- npartitions = 1
- else:
- npartitions = get_npartitions()
-
- x = [create_blocks._submit(args=(partition, npartitions, axis),
- num_return_vals=npartitions)
- for partition in partitions]
-
- # In the case that axis is 1 we have to transpose because we build the
- # columns into rows. Fortunately numpy is efficient at this.
- blocks = np.array(x) if axis == 0 else np.array(x).T
-
- # Sometimes we only get a single column or row, which is
- # problematic for building blocks from the partitions, so we
- # add whatever dimension we're missing from the input.
- return fix_blocks_dimensions(blocks, axis)
-
-
-@ray.remote
-def create_blocks(df, npartitions, axis):
- return create_blocks_helper(df, npartitions, axis)
-
-
-def create_blocks_helper(df, npartitions, axis):
- # Single partition dataframes don't need to be repartitioned
- if npartitions == 1:
- return df
- # In the case that the size is not a multiple of the number of partitions,
- # we need to add one to each partition to avoid losing data off the end
- block_size = df.shape[axis ^ 1] // npartitions \
- if df.shape[axis ^ 1] % npartitions == 0 \
- else df.shape[axis ^ 1] // npartitions + 1
-
- # if not isinstance(df.columns, pandas.RangeIndex):
- # df.columns = pandas.RangeIndex(0, len(df.columns))
-
- blocks = [df.iloc[:, i * block_size: (i + 1) * block_size]
- if axis == 0
- else df.iloc[i * block_size: (i + 1) * block_size, :]
- for i in range(npartitions)]
-
- for block in blocks:
- block.columns = pandas.RangeIndex(0, len(block.columns))
- block.reset_index(inplace=True, drop=True)
- return blocks
-
-
-@memoize
-@ray.remote
-def _blocks_to_col(*partition):
- if len(partition):
- return pandas.concat(partition, axis=0, copy=False)\
- .reset_index(drop=True)
- else:
- return pandas.Series()
-
-
-@memoize
-@ray.remote
-def _blocks_to_row(*partition):
- row_part = pandas.concat(partition, axis=1, copy=False)\
- .reset_index(drop=True)
- # Because our block partitions contain different indices (for the
- # columns), this change is needed to ensure correctness.
- row_part.columns = pandas.RangeIndex(0, len(row_part.columns))
- return row_part
-
-
-def _inherit_docstrings(parent, excluded=[]):
- """Creates a decorator which overwrites a decorated class' __doc__
- attribute with parent's __doc__ attribute. Also overwrites __doc__ of
- methods and properties defined in the class with the __doc__ of matching
- methods and properties in parent.
-
- Args:
- parent (object): Class from which the decorated class inherits __doc__.
- excluded (list): List of parent objects from which the class does not
- inherit docstrings.
-
- Returns:
- function: decorator which replaces the decorated class' documentation
- parent's documentation.
- """
- def decorator(cls):
- if parent not in excluded:
- cls.__doc__ = parent.__doc__
- for attr, obj in cls.__dict__.items():
- parent_obj = getattr(parent, attr, None)
- if parent_obj in excluded or \
- (not callable(parent_obj) and
- not isinstance(parent_obj, property)):
- continue
- if callable(obj):
- obj.__doc__ = parent_obj.__doc__
- elif isinstance(obj, property) and obj.fget is not None:
- p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__)
- setattr(cls, attr, p)
-
- return cls
-
- return decorator
-
-
-@ray.remote
-def _reindex_helper(old_index, new_index, axis, npartitions, *df):
- """Reindexes a dataframe to prepare for join/concat.
-
- Args:
- df: The DataFrame partition
- old_index: The index/column for this partition.
- new_index: The new index/column to assign.
- axis: Which axis to reindex over.
-
- Returns:
- A new set of blocks made up of DataFrames.
- """
- df = pandas.concat(df, axis=axis ^ 1)
- if axis == 1:
- df.index = old_index
- elif axis == 0:
- df.columns = old_index
-
- df = df.reindex(new_index, copy=False, axis=axis ^ 1)
- return create_blocks_helper(df, npartitions, axis)
-
-
-@ray.remote
-def _co_op_helper(func, left_columns, right_columns, left_df_len, left_idx,
- *zipped):
- """Copartition operation where two DataFrames must have aligned indexes.
-
- NOTE: This function assumes things are already copartitioned. Requires that
- row partitions are passed in as blocks.
-
- Args:
- func: The operation to conduct between two DataFrames.
- left_columns: The column names for the left DataFrame.
- right_columns: The column names for the right DataFrame.
- left_df_len: The length of the left. This is used so we can split up
- the zipped partitions.
- zipped: The DataFrame partitions (in blocks).
-
- Returns:
- A new set of blocks for the partitioned DataFrame.
- """
- left = pandas.concat(zipped[:left_df_len], axis=1, copy=False).copy()
- left.columns = left_columns
- if left_idx is not None:
- left.index = left_idx
-
- right = pandas.concat(zipped[left_df_len:], axis=1, copy=False).copy()
- right.columns = right_columns
-
- new_rows = func(left, right)
-
- new_blocks = create_blocks_helper(new_rows, left_df_len, 0)
-
- if left_idx is not None:
- new_blocks.append(new_rows.index)
-
- return new_blocks
-
-
-@ray.remote
-def _match_partitioning(column_partition, lengths, index):
- """Match the number of rows on each partition. Used in df.merge().
-
- NOTE: This function can cause problems when there are empty column
- partitions.
-
- The way this function is intended to be used is as follows: Align the
- right partitioning with the left. The left will remain unchanged. Then,
- you are free to perform actions on a per-partition basis with the
- partitioning.
-
- The index objects must already be identical for this to work correctly.
-
- Args:
- column_partition: The column partition to change.
- lengths: The lengths of each row partition to match to.
- index: The index index of the column_partition. This is used to push
- down to the inner frame for correctness in the merge.
-
- Returns:
- A list of blocks created from this column partition.
- """
- partitioned_list = []
-
- columns = column_partition.columns
- # We set this because this is the only place we can guarantee correct
- # placement. We use it in the case the user wants to join on the index.
- column_partition.index = index
- for length in lengths:
- if len(column_partition) == 0:
- partitioned_list.append(pandas.DataFrame(columns=columns))
- continue
-
- partitioned_list.append(column_partition.iloc[:length, :])
- column_partition = column_partition.iloc[length:, :]
- return partitioned_list
-
-
-@ray.remote
-def _concat_index(*index_parts):
- return index_parts[0].append(index_parts[1:])
-
-
-def fix_blocks_dimensions(blocks, axis):
- """Checks that blocks is 2D, and adds a dimension if not.
- """
- if blocks.ndim < 2:
- return np.expand_dims(blocks, axis=axis ^ 1)
- return blocks
-
-
-@ray.remote
-def _compile_remote_dtypes(*column_of_blocks):
- small_dfs = [df.loc[0:0] for df in column_of_blocks]
- return pandas.concat(small_dfs, copy=False).dtypes