Files
ray/python/ray/dataframe/test/test_io.py
T
Simon Mo d78a22f94c [DataFrame] Implement IO for ray_df (#1599)
* Add parquet-cpp to gitignore

* Add read_csv and read_parquet

* Gitignore pytest_cache

* Fix flake8

* Add io to __init__

* Changing Index. Currently running tests, but so far untested.

* Removing issue of reassigning DF in from_pandas

* Fixing lint

* Fix bug

* Fix bug

* Fix bug

* Better performance

* Fixing index issue with sum

* Address comments

* Update io with index

* Updating performance and implementation. Adding tests

* Fixing off-by-1

* Fix lint

* Address Comments

* Make pop compatible with new to_pandas

* Format Code

* Cleanup some index issue

* Bug fix: assigned reset_index back

* Remove unused debug line
2018-02-26 18:26:38 -08:00

92 lines
2.1 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import numpy as np
import pandas as pd
import ray
import ray.dataframe as rdf
import ray.dataframe.io as io
import os
# Scratch files, given as relative paths, that the setup helpers create and
# the teardown helpers remove.
TEST_PARQUET_FILENAME = 'test.parquet'
TEST_CSV_FILENAME = 'test.csv'
# Row counts for the generated test frames. These are kept as ints: a float
# count such as 7e6 makes np.arange() emit float64 columns rather than
# integer ones.
SMALL_ROW_SIZE = 2000
LARGE_ROW_SIZE = int(7e6)
def ray_df_equals_pandas(ray_df, pandas_df):
    """Return whether ``ray_df`` matches ``pandas_df`` once both are index-sorted.

    This is a plain helper, not a pytest fixture: the tests call it directly
    with arguments, and pytest (4.0 and later) raises an error when a
    fixture function is invoked directly.

    Args:
        ray_df: Frame to convert with ``rdf.to_pandas``.
        pandas_df: pandas DataFrame holding the expected contents.

    Returns:
        The result of ``DataFrame.equals`` on the two index-sorted frames.
    """
    converted = rdf.to_pandas(ray_df)
    return converted.sort_index().equals(pandas_df.sort_index())
def setup_parquet_file(row_size, force=False):
    """Create the test parquet file unless one is already on disk.

    This is a plain helper, not a pytest fixture: the tests call it directly
    with arguments, and pytest (4.0 and later) raises an error when a
    fixture function is invoked directly.

    Args:
        row_size: Number of rows to generate; passed to ``np.arange``.
        force: When true, rewrite the file even if it already exists.
    """
    if os.path.exists(TEST_PARQUET_FILENAME) and not force:
        # An existing file is reused as-is, whatever row_size it was
        # written with.
        return

    df = pd.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_parquet(TEST_PARQUET_FILENAME)
def teardown_parquet_file():
    """Delete the test parquet file if it exists; do nothing otherwise.

    This is a plain helper, not a pytest fixture: the tests call it
    directly, and pytest (4.0 and later) raises an error when a fixture
    function is invoked directly.
    """
    if os.path.exists(TEST_PARQUET_FILENAME):
        os.remove(TEST_PARQUET_FILENAME)
def setup_csv_file(row_size, force=False):
    """Create the test CSV file unless one is already on disk.

    This is a plain helper, not a pytest fixture: the tests call it directly
    with arguments, and pytest (4.0 and later) raises an error when a
    fixture function is invoked directly.

    Args:
        row_size: Number of rows to generate; passed to ``np.arange``.
        force: When true, rewrite the file even if it already exists.
    """
    if os.path.exists(TEST_CSV_FILENAME) and not force:
        # An existing file is reused as-is, whatever row_size it was
        # written with.
        return

    df = pd.DataFrame({
        'col1': np.arange(row_size),
        'col2': np.arange(row_size)
    })
    df.to_csv(TEST_CSV_FILENAME)
def teardown_csv_file():
    """Delete the test CSV file if it exists; do nothing otherwise.

    This is a plain helper, not a pytest fixture: the tests call it
    directly, and pytest (4.0 and later) raises an error when a fixture
    function is invoked directly.
    """
    if os.path.exists(TEST_CSV_FILENAME):
        os.remove(TEST_CSV_FILENAME)
def test_from_parquet_small():
    """Check that io.read_parquet matches pd.read_parquet on a small file."""
    # NOTE(review): this is the only test in view that calls ray.init(); the
    # other tests appear to rely on it having run first. Confirm the
    # intended test ordering.
    ray.init()
    try:
        setup_parquet_file(SMALL_ROW_SIZE)
        pd_df = pd.read_parquet(TEST_PARQUET_FILENAME)
        ray_df = io.read_parquet(TEST_PARQUET_FILENAME)
        assert ray_df_equals_pandas(ray_df, pd_df)
    finally:
        # Always remove the file: setup_parquet_file() reuses an existing
        # file, so one left behind by a failure would leak into later tests.
        teardown_parquet_file()
def test_from_parquet_large():
    """Check that io.read_parquet matches pd.read_parquet on a large file."""
    try:
        setup_parquet_file(LARGE_ROW_SIZE)
        pd_df = pd.read_parquet(TEST_PARQUET_FILENAME)
        ray_df = io.read_parquet(TEST_PARQUET_FILENAME)
        assert ray_df_equals_pandas(ray_df, pd_df)
    finally:
        # Always remove the file: setup_parquet_file() reuses an existing
        # file, so one left behind by a failure would leak into later tests.
        teardown_parquet_file()
def test_from_csv():
    """Check that io.read_csv matches pd.read_csv on a small file."""
    try:
        setup_csv_file(SMALL_ROW_SIZE)
        pd_df = pd.read_csv(TEST_CSV_FILENAME)
        ray_df = io.read_csv(TEST_CSV_FILENAME)
        assert ray_df_equals_pandas(ray_df, pd_df)
    finally:
        # Always remove the file: setup_csv_file() reuses an existing file,
        # so one left behind by a failure would leak into later tests.
        teardown_csv_file()