[DataFrame] Implement quantile (#1992)

* added quantile method

* updated init for datetime signatures

* updated documentation for _map_partitions return type

* removed extraneous print call

* updated for simplicity

* fixed dtyping issues and error raising

* updated datetime dtype checking

* Fixing quantile implementation

* Fix minor bug

* Fixing diff
This commit is contained in:
Rohan Singh
2018-05-06 18:25:13 -07:00
committed by Devin Petersohn
parent ad1afeb268
commit 1848745223
4 changed files with 75 additions and 29 deletions
+6 -4
View File
@@ -3,7 +3,9 @@ from __future__ import division
from __future__ import print_function
import pandas as pd
from pandas import (eval, unique, value_counts, Panel, date_range, MultiIndex)
from pandas import (eval, MultiIndex, CategoricalIndex, Series, Panel, unique,
value_counts, date_range, Timedelta, Timestamp,
to_datetime, NaT)
import threading
pd_version = pd.__version__
@@ -29,7 +31,6 @@ def get_npartitions():
# We import these file after above two function
# because they depend on npartitions.
from .dataframe import DataFrame # noqa: 402
from .series import Series # noqa: 402
from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402
read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402
read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402
@@ -40,8 +41,9 @@ from .reshape import get_dummies # noqa: 402
__all__ = [
"DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval",
"unique", "value_counts", "to_datetime", "get_dummies", "Panel",
"date_range", "MultiIndex"
"Panel", "unique", "get_dummies", "value_counts", "MultiIndex",
"CategoricalIndex", "Timedelta", "Timestamp", "date_range", "to_datetime",
"NaT"
]
try:
+68 -23
View File
@@ -363,7 +363,7 @@ class DataFrame(object):
# We use the index to get the internal index.
oid_series = [(oid_series[i], i) for i in range(len(oid_series))]
if len(oid_series) > 1:
if len(oid_series) > 0:
for df, partition in oid_series:
this_partition = \
self._col_metadata.partition_series(partition)
@@ -3103,35 +3103,80 @@ class DataFrame(object):
are the quantiles.
"""
def quantile_helper(df, q, axis, numeric_only, interpolation):
try:
def check_bad_dtype(t):
return t == np.dtype('O') or is_timedelta64_dtype(t)
if not numeric_only:
# check if there are any object columns
if all(check_bad_dtype(t) for t in self.dtypes):
raise TypeError("can't multiply sequence by non-int of type "
"'float'")
else:
if next((True for t in self.dtypes if check_bad_dtype(t)),
False):
dtype = next(t for t in self.dtypes if check_bad_dtype(t))
raise ValueError("Cannot compare type '{}' with type '{}'"
.format(type(dtype), float))
else:
# Normally pandas returns this near the end of the quantile, but we
# can't afford the overhead of running the entire operation before
# we error.
if all(check_bad_dtype(t) for t in self.dtypes):
raise ValueError("need at least one array to concatenate")
# check that all qs are between 0 and 1
pd.DataFrame()._check_percentile(q)
def quantile_helper(df, base_object):
"""Quantile to be run inside each partitoin.
Args:
df: The DataFrame composing the partition.
base_object: An empty pd.Series or pd.DataFrame depending on q.
Returns:
A new Series or DataFrame depending on q.
"""
# This if call prevents ValueErrors with object only partitions
if (numeric_only and
all([dtype == np.dtype('O') or
is_timedelta64_dtype(dtype)
for dtype in df.dtypes])):
return base_object
else:
return df.quantile(q=q, axis=axis, numeric_only=numeric_only,
interpolation=interpolation)
except ValueError:
return pd.Series()
axis = pd.DataFrame()._get_axis_number(axis)
if isinstance(q, (pd.Series, np.ndarray, pd.Index, list)):
# In the case of a list, we build it one at a time.
# TODO Revisit for performance
quantiles = []
for q_i in q:
def remote_func(df):
return quantile_helper(df, q=q_i, axis=axis,
numeric_only=numeric_only,
interpolation=interpolation)
result = self._arithmetic_helper(remote_func, axis)
result.name = q_i
quantiles.append(result)
q_index = pd.Float64Index(q)
if axis == 0:
new_partitions = _map_partitions(
lambda df: quantile_helper(df, pd.DataFrame()),
self._col_partitions)
# select only correct dtype columns
new_columns = self.dtypes[self.dtypes.apply(
lambda x: is_numeric_dtype(x))].index
else:
new_partitions = _map_partitions(
lambda df: quantile_helper(df, pd.DataFrame()),
self._row_partitions)
new_columns = self.index
return DataFrame(col_partitions=new_partitions,
index=q_index,
columns=new_columns)
return pd.concat(quantiles, axis=1).T
else:
def remote_func(df):
return quantile_helper(df, q=q, axis=axis,
numeric_only=numeric_only,
interpolation=interpolation)
result = self._arithmetic_helper(remote_func, axis)
# When q is a single float, we return a Series, so using
# arithmetic_helper works well here.
result = self._arithmetic_helper(
lambda df: quantile_helper(df, pd.Series()), axis)
result.name = q
return result
+1 -1
View File
@@ -140,7 +140,7 @@ def _map_partitions(func, partitions, *argslists):
partitions ([ObjectID]): The list of partitions to map func on.
Returns:
A new Dataframe containing the result of the function
A list of partitions ([ObjectID]) with the result of the function
"""
if partitions is None:
return None