Files
ray/python/ray/dataframe/io.py
T
Kunal Gosar 8560993b46 [Dataframe] Change pandas and ray.dataframe imports (#1942)
* fixing zero length partitions

* fixing bugs to fully handle zero len parts

* resolve comments

* renaming imports
2018-06-15 16:17:16 -07:00

525 lines
16 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from itertools import chain
from io import BytesIO
import os
import re
import warnings
import pandas
from pyarrow.parquet import ParquetFile
from pandas.io.common import _infer_compression # don't depend on internal API
from .dataframe import ray, DataFrame
from . import get_npartitions
from .utils import from_pandas
# Parquet
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Ray DataFrame only supports the pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
            We only support local files for now.
        engine: Ray only supports the pyarrow reader.
            This argument doesn't do anything for now.
        columns: Optional list of column names to load. When None, every
            name in the file schema is loaded except those matching
            ``__index_level_<digits>__`` (presumably the index columns
            pyarrow stores for pandas frames).
        kwargs: Passed into parquet's read_row_group function.

    Returns:
        A DataFrame whose row partitions are the pieces obtained by
        splitting every row group of the file.
    """
    pf = ParquetFile(path)
    metadata = pf.metadata
    # A file with fewer rows than partitions would give a chunk size of
    # zero, and splitting by zero rows can never consume the frame.
    chunksize = max(1, metadata.num_rows // get_npartitions())

    # Only infer the column list when the caller did not ask for one.
    if columns is None:
        idx_regex = re.compile(r'__index_level_\d+__')
        columns = [
            name for name in metadata.schema.names
            if not idx_regex.match(name)
        ]

    # One remote read per row group, then one remote split per read.
    df_from_row_groups = [
        _read_parquet_row_group.remote(path, columns, i, kwargs)
        for i in range(metadata.num_row_groups)
    ]
    split_dfs = ray.get(
        [_split_df.remote(df, chunksize) for df in df_from_row_groups])
    df_remotes = list(chain.from_iterable(split_dfs))

    return DataFrame(row_partitions=df_remotes, columns=columns)
@ray.remote
def _read_parquet_row_group(path, columns, row_group_id, kwargs=None):
    """Read one row group of a parquet file into a pandas DataFrame.

    Args:
        path: The filepath of the parquet file.
        columns: Column names to read from the row group.
        row_group_id: Index of the row group to read.
        kwargs: Optional dict of extra keyword arguments passed to
            read_row_group.

    Returns:
        The row group converted with ``to_pandas()``.
    """
    # Use None rather than a shared mutable ``{}`` as the default.
    if kwargs is None:
        kwargs = {}
    pf = ParquetFile(path)
    table = pf.read_row_group(row_group_id, columns=columns, **kwargs)
    return table.to_pandas()
@ray.remote
def _split_df(pandas_df, chunksize):
    """Split a pandas_df into partitions of at most `chunksize` rows.

    Every partition gets a fresh 0-based index before it is put into the
    object store.

    Args:
        pandas_df: The pandas DataFrame to split.
        chunksize: Maximum number of rows per partition. Values below 1
            are treated as 1.

    Returns:
        remote_df_ids ([ObjectID])
    """
    # A non-positive chunk size would never shrink the frame in the loop.
    chunksize = max(1, chunksize)
    dataframes = []
    while len(pandas_df) > chunksize:
        # reset_index returns a new frame, so the result has to be kept.
        head = pandas_df[:chunksize].reset_index(drop=True)
        dataframes.append(ray.put(head))
        pandas_df = pandas_df[chunksize:]
    # Whatever is left (possibly an empty frame) is the last partition.
    dataframes.append(ray.put(pandas_df.reset_index(drop=True)))
    return dataframes
# CSV
def _compute_offset(fn, npartitions, ignore_first_line=False):
    """Calculate the byte offsets that split a csv file on line boundaries.

    Args:
        fn: Path of the csv file.
        npartitions: Number of chunks to aim for. The target chunk size is
            the payload size divided by this number (at least one byte).
        ignore_first_line: If True, the first line is skipped and the
            first chunk starts right after it.

    Returns:
        A list of (start, end) tuples. `end` is the position of the
        newline that terminates the chunk, or the file size when the chunk
        runs to EOF, so reading `end - start` bytes from `start` yields the
        chunk without its terminating newline.
    """
    total_bytes = os.path.getsize(fn)
    offsets = []
    with open(fn, 'rb') as bio:
        # Reading the first line also moves the cursor past it.
        start = len(bio.readline()) if ignore_first_line else 0
        chunksize = (total_bytes - start) // npartitions
        if chunksize == 0:
            chunksize = 1

        while start < total_bytes:
            bio.seek(chunksize, 1)  # Move forward {chunksize} bytes
            extend_line = bio.readline()  # Move after the next \n
            if extend_line.endswith(b'\n'):
                # The position of the \n we just crossed.
                end = start + chunksize + len(extend_line) - 1
            else:
                # EOF was reached without a newline: there is no
                # terminator to exclude, so the chunk ends at the file
                # size and the last byte is kept.
                end = total_bytes
            offsets.append((start, end))
            start = end + 1
    return offsets
def _get_firstline(file_path):
    """Return the first line of `file_path` as bytes.

    The line is returned exactly as read in binary mode, including its
    trailing newline when the file has one.
    """
    # The context manager closes the file even if readline raises.
    with open(file_path, 'rb') as bio:
        return bio.readline()
def _infer_column(first_line, kwargs=None):
    """Infer column labels by parsing only the first line of a csv file.

    Args:
        first_line: Bytes of the file's first line.
        kwargs: Optional dict of keyword arguments for pandas.read_csv.

    Returns:
        The ``columns`` of the frame pandas parses from `first_line`.
    """
    # Use None rather than a shared mutable ``{}`` as the default.
    if kwargs is None:
        kwargs = {}
    return pandas.read_csv(BytesIO(first_line), **kwargs).columns
@ray.remote
def _read_csv_with_offset(fn, start, end, kwargs=None, header=b''):
    """Parse one byte range of a csv file into a pandas DataFrame.

    Args:
        fn: Path of the csv file.
        start: Byte offset where the range begins.
        end: Byte offset where the range stops; `end - start` bytes are
            read.
        kwargs: Optional dict of keyword arguments for pandas.read_csv.
        header: Bytes prepended to the range before parsing.

    Returns:
        A (pandas_df, index) pair: the parsed frame with its index replaced
        by a RangeIndex, and the index pandas originally produced.
    """
    # Work on a copy so that neither the caller's dict nor a shared
    # default is modified by the coercion below.
    kwargs = dict(kwargs) if kwargs is not None else {}
    if "quoting" in kwargs:
        kwargs["quoting"] = int(kwargs["quoting"])  # See issue #2078

    with open(fn, 'rb') as bio:
        bio.seek(start)
        to_read = header + bio.read(end - start)

    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    index = pandas_df.index
    # Partitions must have RangeIndex
    pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
    return pandas_df, index
@ray.remote
def get_index(*partition_indices):
    """Join the per-partition indices, in order, into a single index.

    Args:
        partition_indices: The index of each partition; at least one is
            required.

    Returns:
        The first index with all the remaining ones appended to it.
    """
    head = partition_indices[0]
    tail = partition_indices[1:]
    return head.append(tail)
def read_csv(filepath_or_buffer,
             sep=',',
             delimiter=None,
             header='infer',
             names=None,
             index_col=None,
             usecols=None,
             squeeze=False,
             prefix=None,
             mangle_dupe_cols=True,
             dtype=None,
             engine=None,
             converters=None,
             true_values=None,
             false_values=None,
             skipinitialspace=False,
             skiprows=None,
             nrows=None,
             na_values=None,
             keep_default_na=True,
             na_filter=True,
             verbose=False,
             skip_blank_lines=True,
             parse_dates=False,
             infer_datetime_format=False,
             keep_date_col=False,
             date_parser=None,
             dayfirst=False,
             iterator=False,
             chunksize=None,
             compression='infer',
             thousands=None,
             decimal=b'.',
             lineterminator=None,
             quotechar='"',
             quoting=0,
             escapechar=None,
             comment=None,
             encoding=None,
             dialect=None,
             tupleize_cols=None,
             error_bad_lines=True,
             warn_bad_lines=True,
             skipfooter=0,
             skip_footer=0,
             doublequote=True,
             delim_whitespace=False,
             as_recarray=None,
             compact_ints=None,
             use_unsigned=None,
             low_memory=True,
             buffer_lines=None,
             memory_map=False,
             float_precision=None):
    """Read a csv file from local disk into a DataFrame.

    The file is cut into byte ranges on line boundaries and each range is
    parsed by pandas.read_csv in its own remote task. Inputs that are not
    plain string paths, or for which a compression is inferred, are read
    with pandas directly instead.

    Args:
        filepath_or_buffer: The filepath of the csv file.
            We only support local files for the distributed path.
        kwargs: Every other argument is a pandas.read_csv keyword and is
            forwarded to pandas unchanged.

    Returns:
        A DataFrame. On the pandas fallback path, a result that is not a
        pandas DataFrame is returned as pandas produced it.
    """
    kwargs = {
        'sep': sep,
        'delimiter': delimiter,
        'header': header,
        'names': names,
        'index_col': index_col,
        'usecols': usecols,
        'squeeze': squeeze,
        'prefix': prefix,
        'mangle_dupe_cols': mangle_dupe_cols,
        'dtype': dtype,
        'engine': engine,
        'converters': converters,
        'true_values': true_values,
        'false_values': false_values,
        'skipinitialspace': skipinitialspace,
        'skiprows': skiprows,
        'nrows': nrows,
        'na_values': na_values,
        'keep_default_na': keep_default_na,
        'na_filter': na_filter,
        'verbose': verbose,
        'skip_blank_lines': skip_blank_lines,
        'parse_dates': parse_dates,
        'infer_datetime_format': infer_datetime_format,
        'keep_date_col': keep_date_col,
        'date_parser': date_parser,
        'dayfirst': dayfirst,
        'iterator': iterator,
        'chunksize': chunksize,
        'compression': compression,
        'thousands': thousands,
        'decimal': decimal,
        'lineterminator': lineterminator,
        'quotechar': quotechar,
        'quoting': quoting,
        'escapechar': escapechar,
        'comment': comment,
        'encoding': encoding,
        'dialect': dialect,
        'tupleize_cols': tupleize_cols,
        'error_bad_lines': error_bad_lines,
        'warn_bad_lines': warn_bad_lines,
        'skipfooter': skipfooter,
        'skip_footer': skip_footer,
        'doublequote': doublequote,
        'delim_whitespace': delim_whitespace,
        'as_recarray': as_recarray,
        'compact_ints': compact_ints,
        'use_unsigned': use_unsigned,
        'low_memory': low_memory,
        'buffer_lines': buffer_lines,
        'memory_map': memory_map,
        'float_precision': float_precision,
    }

    # Anything that is not a plain path, or that is compressed, cannot be
    # sliced by byte offset: let pandas read it in one go.
    if isinstance(filepath_or_buffer, str):
        use_pandas = _infer_compression(filepath_or_buffer,
                                        compression) is not None
    else:
        use_pandas = True

    if use_pandas:
        warnings.warn("Defaulting to Pandas implementation",
                      PendingDeprecationWarning)
        pandas_obj = pandas.read_csv(filepath_or_buffer, **kwargs)
        if not isinstance(pandas_obj, pandas.DataFrame):
            return pandas_obj
        return from_pandas(pandas_obj, get_npartitions())

    filepath = filepath_or_buffer

    # TODO: handle case where header is a list of lines
    first_line = _get_firstline(filepath)
    # Column labels are inferred before first_line may be blanked below.
    columns = _infer_column(first_line, kwargs=kwargs)

    # The first line is data (not a header) when no header is requested
    # or when explicit names are supplied with header left at 'infer'.
    first_line_is_data = header is None or (header == "infer"
                                            and names is not None)
    if first_line_is_data:
        first_line = b""
    ignore_first_line = not first_line_is_data

    offsets = _compute_offset(filepath, get_npartitions(),
                              ignore_first_line=ignore_first_line)

    # Serialize objects to speed up later use in remote tasks
    first_line_id = ray.put(first_line)
    kwargs_id = ray.put(kwargs)

    df_obj_ids = []
    index_obj_ids = []
    for start, end in offsets:
        task_args = (filepath, start, end, kwargs_id)
        if start != 0:
            # Ranges that do not begin at byte 0 get the first line
            # prepended as their header.
            task_args = task_args + (first_line_id,)
        df, index = _read_csv_with_offset._submit(
            args=task_args,
            num_return_vals=2)
        df_obj_ids.append(df)
        index_obj_ids.append(index)

    if index_col is not None:
        index = get_index.remote(*index_obj_ids)
    else:
        index = None

    return DataFrame(row_partitions=df_obj_ids, columns=columns, index=index)
def read_json(path_or_buf=None,
              orient=None,
              typ='frame',
              dtype=True,
              convert_axes=True,
              convert_dates=True,
              keep_default_dates=True,
              numpy=False,
              precise_float=False,
              date_unit=None,
              encoding=None,
              lines=False,
              chunksize=None,
              compression='infer'):
    """Read JSON with pandas and distribute the resulting frame.

    Args:
        path_or_buf: Source handed to pandas.read_json.
        kwargs: Every other argument is forwarded to pandas.read_json under
            the same name.

    Returns:
        A DataFrame built from the pandas result. A result that is not a
        pandas DataFrame is returned unchanged, matching the pandas
        fallback in read_csv.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward by keyword so each value binds to the pandas parameter of
    # the same name, whatever pandas' positional order is.
    port_frame = pandas.read_json(
        path_or_buf, orient=orient, typ=typ, dtype=dtype,
        convert_axes=convert_axes, convert_dates=convert_dates,
        keep_default_dates=keep_default_dates, numpy=numpy,
        precise_float=precise_float, date_unit=date_unit,
        encoding=encoding, lines=lines, chunksize=chunksize,
        compression=compression)

    # NOTE(review): `typ` / `chunksize` look like they can make pandas
    # return something other than a DataFrame; only frames are partitioned.
    if isinstance(port_frame, pandas.DataFrame):
        return from_pandas(port_frame, get_npartitions())
    return port_frame
def read_html(io,
              match='.+',
              flavor=None,
              header=None,
              index_col=None,
              skiprows=None,
              attrs=None,
              parse_dates=False,
              tupleize_cols=None,
              thousands=',',
              encoding=None,
              decimal='.',
              converters=None,
              na_values=None,
              keep_default_na=True):
    """Read HTML tables with pandas and distribute the first one.

    Args:
        io: Source handed to pandas.read_html.
        kwargs: Every other argument is forwarded to pandas.read_html under
            the same name.

    Returns:
        A DataFrame built from the first element of the pandas result;
        any further elements are discarded.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward by keyword so each value binds to the pandas parameter of
    # the same name, whatever pandas' positional order is.
    port_frame = pandas.read_html(
        io, match=match, flavor=flavor, header=header, index_col=index_col,
        skiprows=skiprows, attrs=attrs, parse_dates=parse_dates,
        tupleize_cols=tupleize_cols, thousands=thousands,
        encoding=encoding, decimal=decimal, converters=converters,
        na_values=na_values, keep_default_na=keep_default_na)
    ray_frame = from_pandas(port_frame[0], get_npartitions())

    return ray_frame
def read_clipboard(sep=r'\s+'):
    """Read the clipboard with pandas and distribute the resulting frame.

    Args:
        sep: Separator forwarded to pandas.read_clipboard.

    Returns:
        A DataFrame built from the pandas result.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    clipboard_frame = pandas.read_clipboard(sep)
    return from_pandas(clipboard_frame, get_npartitions())
def read_excel(io,
               sheet_name=0,
               header=0,
               skiprows=None,
               skip_footer=0,
               index_col=None,
               names=None,
               usecols=None,
               parse_dates=False,
               date_parser=None,
               na_values=None,
               thousands=None,
               convert_float=True,
               converters=None,
               dtype=None,
               true_values=None,
               false_values=None,
               engine=None,
               squeeze=False):
    """Read an Excel sheet with pandas and distribute the resulting frame.

    Args:
        io: Source handed to pandas.read_excel.
        kwargs: Every other argument is forwarded to pandas.read_excel
            under the same name.

    Returns:
        A DataFrame built from the pandas result.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward by keyword: with positional forwarding every value after
    # `header` would bind to the wrong pandas parameter whenever pandas
    # orders its parameters differently from this wrapper.
    port_frame = pandas.read_excel(
        io, sheet_name=sheet_name, header=header, skiprows=skiprows,
        skip_footer=skip_footer, index_col=index_col, names=names,
        usecols=usecols, parse_dates=parse_dates, date_parser=date_parser,
        na_values=na_values, thousands=thousands,
        convert_float=convert_float, converters=converters, dtype=dtype,
        true_values=true_values, false_values=false_values, engine=engine,
        squeeze=squeeze)
    ray_frame = from_pandas(port_frame, get_npartitions())

    return ray_frame
def read_hdf(path_or_buf,
             key=None,
             mode='r'):
    """Read an HDF store with pandas and distribute the resulting frame.

    Args:
        path_or_buf: Source handed to pandas.read_hdf.
        key: Forwarded to pandas.read_hdf.
        mode: Forwarded to pandas.read_hdf.

    Returns:
        A DataFrame built from the pandas result.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    hdf_frame = pandas.read_hdf(path_or_buf, key, mode)
    return from_pandas(hdf_frame, get_npartitions())
def read_feather(path,
                 nthreads=1):
    """Read a feather file with pandas and distribute the resulting frame.

    Args:
        path: Source handed to pandas.read_feather.
        nthreads: Forwarded to pandas.read_feather.

    Returns:
        A DataFrame built from the pandas result.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # `nthreads` used to be accepted here but never passed on to pandas.
    port_frame = pandas.read_feather(path, nthreads=nthreads)
    ray_frame = from_pandas(port_frame, get_npartitions())

    return ray_frame
def read_msgpack(path_or_buf,
                 encoding='utf-8',
                 iterator=False):
    """Read msgpack data with pandas and distribute the resulting frame.

    Args:
        path_or_buf: Source handed to pandas.read_msgpack.
        encoding: Forwarded to pandas.read_msgpack.
        iterator: Forwarded to pandas.read_msgpack.

    Returns:
        A DataFrame built from the pandas result. A result that is not a
        pandas DataFrame is returned unchanged, matching the pandas
        fallback in read_csv.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward by keyword so each value binds to the pandas parameter of
    # the same name.
    port_frame = pandas.read_msgpack(path_or_buf, encoding=encoding,
                                     iterator=iterator)

    # NOTE(review): `iterator` looks like it can make pandas return
    # something other than a DataFrame; only frames are partitioned.
    if isinstance(port_frame, pandas.DataFrame):
        return from_pandas(port_frame, get_npartitions())
    return port_frame
def read_stata(filepath_or_buffer,
               convert_dates=True,
               convert_categoricals=True,
               encoding=None,
               index_col=None,
               convert_missing=False,
               preserve_dtypes=True,
               columns=None,
               order_categoricals=True,
               chunksize=None,
               iterator=False):
    """Read a Stata file with pandas and distribute the resulting frame.

    Args:
        filepath_or_buffer: Source handed to pandas.read_stata.
        kwargs: Every other argument is forwarded to pandas.read_stata
            under the same name.

    Returns:
        A DataFrame built from the pandas result. A result that is not a
        pandas DataFrame is returned unchanged, matching the pandas
        fallback in read_csv.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward by keyword so each value binds to the pandas parameter of
    # the same name, whatever pandas' positional order is.
    port_frame = pandas.read_stata(
        filepath_or_buffer, convert_dates=convert_dates,
        convert_categoricals=convert_categoricals, encoding=encoding,
        index_col=index_col, convert_missing=convert_missing,
        preserve_dtypes=preserve_dtypes, columns=columns,
        order_categoricals=order_categoricals, chunksize=chunksize,
        iterator=iterator)

    # NOTE(review): `chunksize` / `iterator` look like they can make
    # pandas return a reader object; only frames are partitioned.
    if isinstance(port_frame, pandas.DataFrame):
        return from_pandas(port_frame, get_npartitions())
    return port_frame
def read_sas(filepath_or_buffer,
             format=None,
             index=None,
             encoding=None,
             chunksize=None,
             iterator=False):
    """Read a SAS file with pandas and distribute the resulting frame.

    Args:
        filepath_or_buffer: Source handed to pandas.read_sas.
        kwargs: Every other argument is forwarded to pandas.read_sas under
            the same name.

    Returns:
        A DataFrame built from the pandas result. A result that is not a
        pandas DataFrame is returned unchanged, matching the pandas
        fallback in read_csv.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward by keyword so each value binds to the pandas parameter of
    # the same name, whatever pandas' positional order is.
    port_frame = pandas.read_sas(
        filepath_or_buffer, format=format, index=index, encoding=encoding,
        chunksize=chunksize, iterator=iterator)

    # NOTE(review): `chunksize` / `iterator` look like they can make
    # pandas return a reader object; only frames are partitioned.
    if isinstance(port_frame, pandas.DataFrame):
        return from_pandas(port_frame, get_npartitions())
    return port_frame
def read_pickle(path,
                compression='infer'):
    """Read a pickled object with pandas and distribute the result.

    Args:
        path: Source handed to pandas.read_pickle.
        compression: Forwarded to pandas.read_pickle.

    Returns:
        A DataFrame built from the pandas result.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    unpickled = pandas.read_pickle(path, compression)
    return from_pandas(unpickled, get_npartitions())
def read_sql(sql,
             con,
             index_col=None,
             coerce_float=True,
             params=None,
             parse_dates=None,
             columns=None,
             chunksize=None):
    """Run a SQL read with pandas and distribute the resulting frame.

    Args:
        sql: Query or table name handed to pandas.read_sql.
        con: Connection handed to pandas.read_sql.
        kwargs: Every other argument is forwarded to pandas.read_sql under
            the same name.

    Returns:
        A DataFrame built from the pandas result. A result that is not a
        pandas DataFrame is returned unchanged, matching the pandas
        fallback in read_csv.
    """
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    # Forward by keyword so each value binds to the pandas parameter of
    # the same name, whatever pandas' positional order is.
    port_frame = pandas.read_sql(
        sql, con, index_col=index_col, coerce_float=coerce_float,
        params=params, parse_dates=parse_dates, columns=columns,
        chunksize=chunksize)

    # NOTE(review): `chunksize` looks like it can make pandas return an
    # iterator of frames; only a single frame is partitioned.
    if isinstance(port_frame, pandas.DataFrame):
        return from_pandas(port_frame, get_npartitions())
    return port_frame