mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 17:40:17 +08:00
[DataFrame] Implement quantile (#1992)
* added quantile method * updated init for datetime signatures * updated documentation for _map_partitions return type * removed extraneous print call * updated for simplicity * fixed dtyping issues and error raising * updated datetime dtype checking * Fixing quantile implementation * Fix minor bug * Fixing diff
This commit is contained in:
committed by
Devin Petersohn
parent
ad1afeb268
commit
1848745223
@@ -3,7 +3,9 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (eval, unique, value_counts, Panel, date_range, MultiIndex)
|
||||
from pandas import (eval, MultiIndex, CategoricalIndex, Series, Panel, unique,
|
||||
value_counts, date_range, Timedelta, Timestamp,
|
||||
to_datetime, NaT)
|
||||
import threading
|
||||
|
||||
pd_version = pd.__version__
|
||||
@@ -29,7 +31,6 @@ def get_npartitions():
|
||||
# We import these file after above two function
|
||||
# because they depend on npartitions.
|
||||
from .dataframe import DataFrame # noqa: 402
|
||||
from .series import Series # noqa: 402
|
||||
from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402
|
||||
read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402
|
||||
read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402
|
||||
@@ -40,8 +41,9 @@ from .reshape import get_dummies # noqa: 402
|
||||
|
||||
__all__ = [
|
||||
"DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval",
|
||||
"unique", "value_counts", "to_datetime", "get_dummies", "Panel",
|
||||
"date_range", "MultiIndex"
|
||||
"Panel", "unique", "get_dummies", "value_counts", "MultiIndex",
|
||||
"CategoricalIndex", "Timedelta", "Timestamp", "date_range", "to_datetime",
|
||||
"NaT"
|
||||
]
|
||||
|
||||
try:
|
||||
|
||||
@@ -363,7 +363,7 @@ class DataFrame(object):
|
||||
# We use the index to get the internal index.
|
||||
oid_series = [(oid_series[i], i) for i in range(len(oid_series))]
|
||||
|
||||
if len(oid_series) > 1:
|
||||
if len(oid_series) > 0:
|
||||
for df, partition in oid_series:
|
||||
this_partition = \
|
||||
self._col_metadata.partition_series(partition)
|
||||
@@ -3103,35 +3103,80 @@ class DataFrame(object):
|
||||
are the quantiles.
|
||||
"""
|
||||
|
||||
def quantile_helper(df, q, axis, numeric_only, interpolation):
|
||||
try:
|
||||
def check_bad_dtype(t):
|
||||
return t == np.dtype('O') or is_timedelta64_dtype(t)
|
||||
|
||||
if not numeric_only:
|
||||
# check if there are any object columns
|
||||
if all(check_bad_dtype(t) for t in self.dtypes):
|
||||
raise TypeError("can't multiply sequence by non-int of type "
|
||||
"'float'")
|
||||
else:
|
||||
if next((True for t in self.dtypes if check_bad_dtype(t)),
|
||||
False):
|
||||
dtype = next(t for t in self.dtypes if check_bad_dtype(t))
|
||||
raise ValueError("Cannot compare type '{}' with type '{}'"
|
||||
.format(type(dtype), float))
|
||||
else:
|
||||
# Normally pandas returns this near the end of the quantile, but we
|
||||
# can't afford the overhead of running the entire operation before
|
||||
# we error.
|
||||
if all(check_bad_dtype(t) for t in self.dtypes):
|
||||
raise ValueError("need at least one array to concatenate")
|
||||
|
||||
# check that all qs are between 0 and 1
|
||||
pd.DataFrame()._check_percentile(q)
|
||||
|
||||
def quantile_helper(df, base_object):
|
||||
"""Quantile to be run inside each partitoin.
|
||||
|
||||
Args:
|
||||
df: The DataFrame composing the partition.
|
||||
base_object: An empty pd.Series or pd.DataFrame depending on q.
|
||||
|
||||
Returns:
|
||||
A new Series or DataFrame depending on q.
|
||||
"""
|
||||
# This if call prevents ValueErrors with object only partitions
|
||||
if (numeric_only and
|
||||
all([dtype == np.dtype('O') or
|
||||
is_timedelta64_dtype(dtype)
|
||||
for dtype in df.dtypes])):
|
||||
return base_object
|
||||
else:
|
||||
return df.quantile(q=q, axis=axis, numeric_only=numeric_only,
|
||||
interpolation=interpolation)
|
||||
except ValueError:
|
||||
return pd.Series()
|
||||
|
||||
axis = pd.DataFrame()._get_axis_number(axis)
|
||||
|
||||
if isinstance(q, (pd.Series, np.ndarray, pd.Index, list)):
|
||||
# In the case of a list, we build it one at a time.
|
||||
# TODO Revisit for performance
|
||||
quantiles = []
|
||||
for q_i in q:
|
||||
def remote_func(df):
|
||||
return quantile_helper(df, q=q_i, axis=axis,
|
||||
numeric_only=numeric_only,
|
||||
interpolation=interpolation)
|
||||
|
||||
result = self._arithmetic_helper(remote_func, axis)
|
||||
result.name = q_i
|
||||
quantiles.append(result)
|
||||
q_index = pd.Float64Index(q)
|
||||
|
||||
if axis == 0:
|
||||
new_partitions = _map_partitions(
|
||||
lambda df: quantile_helper(df, pd.DataFrame()),
|
||||
self._col_partitions)
|
||||
|
||||
# select only correct dtype columns
|
||||
new_columns = self.dtypes[self.dtypes.apply(
|
||||
lambda x: is_numeric_dtype(x))].index
|
||||
|
||||
else:
|
||||
new_partitions = _map_partitions(
|
||||
lambda df: quantile_helper(df, pd.DataFrame()),
|
||||
self._row_partitions)
|
||||
new_columns = self.index
|
||||
|
||||
return DataFrame(col_partitions=new_partitions,
|
||||
index=q_index,
|
||||
columns=new_columns)
|
||||
|
||||
return pd.concat(quantiles, axis=1).T
|
||||
else:
|
||||
def remote_func(df):
|
||||
return quantile_helper(df, q=q, axis=axis,
|
||||
numeric_only=numeric_only,
|
||||
interpolation=interpolation)
|
||||
|
||||
result = self._arithmetic_helper(remote_func, axis)
|
||||
# When q is a single float, we return a Series, so using
|
||||
# arithmetic_helper works well here.
|
||||
result = self._arithmetic_helper(
|
||||
lambda df: quantile_helper(df, pd.Series()), axis)
|
||||
result.name = q
|
||||
return result
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ def _map_partitions(func, partitions, *argslists):
|
||||
partitions ([ObjectID]): The list of partitions to map func on.
|
||||
|
||||
Returns:
|
||||
A new Dataframe containing the result of the function
|
||||
A list of partitions ([ObjectID]) with the result of the function
|
||||
"""
|
||||
if partitions is None:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user