[DataFrame] Adding read methods and tests (#1712)

* Adding read methods and tests * Referencing internal partition method so constructors are more canonical with Pandas * Fixing to reference from_pandas in utils * Cleaning up unused imports * rerunning tests * fixing flake8 * resolving errors * Added sql and sas test * updating * Temporarily phasing out read_csv code for wrapper while diagnosing, added io tests to travis * Adding travis * restoring distributed read csv * resolving rebases * lint * Sampling out HD test * adding dep * fix pathing * Flagging out tests * resolving read_method issues * fix build issue * move additional dependencies to extras * fixing lint * removing IO dependencies * updated requirements doc
2026-06-29 18:44:36 +08:00 · 2018-04-20 18:33:08 -07:00
parent cffda73da1
commit 3c48783a16
4 changed files with 518 additions and 11 deletions
@@ -5,13 +5,23 @@ from __future__ import print_function
 import pytest
 import numpy as np
 import pandas as pd
+from ray.dataframe.utils import to_pandas
 import ray.dataframe.io as io
 import os
-
-from ray.dataframe.utils import to_pandas
+import sqlite3

 TEST_PARQUET_FILENAME = 'test.parquet'
 TEST_CSV_FILENAME = 'test.csv'
+TEST_JSON_FILENAME = 'test.json'
+TEST_HTML_FILENAME = 'test.html'
+TEST_EXCEL_FILENAME = 'test.xlsx'
+TEST_FEATHER_FILENAME = 'test.feather'
+TEST_HDF_FILENAME = 'test.hdf'
+TEST_MSGPACK_FILENAME = 'test.msg'
+TEST_STATA_FILENAME = 'test.dta'
+TEST_PICKLE_FILENAME = 'test.pkl'
+TEST_SAS_FILENAME = os.getcwd() + '/data/test1.sas7bdat'
+TEST_SQL_FILENAME = 'test.db'
 SMALL_ROW_SIZE = 2000
 LARGE_ROW_SIZE = 7e6

@@ -57,6 +67,178 @@ def teardown_csv_file():
        os.remove(TEST_CSV_FILENAME)


+@pytest.fixture
+def setup_json_file(row_size, force=False):
+    if os.path.exists(TEST_JSON_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_json(TEST_JSON_FILENAME)
+
+
+@pytest.fixture
+def teardown_json_file():
+    if os.path.exists(TEST_JSON_FILENAME):
+        os.remove(TEST_JSON_FILENAME)
+
+
+@pytest.fixture
+def setup_html_file(row_size, force=False):
+    if os.path.exists(TEST_HTML_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_html(TEST_HTML_FILENAME)
+
+
+@pytest.fixture
+def teardown_html_file():
+    if os.path.exists(TEST_HTML_FILENAME):
+        os.remove(TEST_HTML_FILENAME)
+
+
+@pytest.fixture
+def setup_clipboard(row_size, force=False):
+    df = pd.DataFrame({
+        'col1': np.arange(row_size),
+        'col2': np.arange(row_size)
+    })
+    df.to_clipboard()
+
+
+@pytest.fixture
+def setup_excel_file(row_size, force=False):
+    if os.path.exists(TEST_EXCEL_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_excel(TEST_EXCEL_FILENAME)
+
+
+@pytest.fixture
+def teardown_excel_file():
+    if os.path.exists(TEST_EXCEL_FILENAME):
+        os.remove(TEST_EXCEL_FILENAME)
+
+
+@pytest.fixture
+def setup_feather_file(row_size, force=False):
+    if os.path.exists(TEST_FEATHER_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_feather(TEST_FEATHER_FILENAME)
+
+
+@pytest.fixture
+def teardown_feather_file():
+    if os.path.exists(TEST_FEATHER_FILENAME):
+        os.remove(TEST_FEATHER_FILENAME)
+
+
+@pytest.fixture
+def setup_hdf_file(row_size, force=False):
+    if os.path.exists(TEST_HDF_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_hdf(TEST_HDF_FILENAME, 'test')
+
+
+@pytest.fixture
+def teardown_hdf_file():
+    if os.path.exists(TEST_HDF_FILENAME):
+        os.remove(TEST_HDF_FILENAME)
+
+
+@pytest.fixture
+def setup_msgpack_file(row_size, force=False):
+    if os.path.exists(TEST_MSGPACK_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_msgpack(TEST_MSGPACK_FILENAME)
+
+
+@pytest.fixture
+def teardown_msgpack_file():
+    if os.path.exists(TEST_MSGPACK_FILENAME):
+        os.remove(TEST_MSGPACK_FILENAME)
+
+
+@pytest.fixture
+def setup_stata_file(row_size, force=False):
+    if os.path.exists(TEST_STATA_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_stata(TEST_STATA_FILENAME)
+
+
+@pytest.fixture
+def teardown_stata_file():
+    if os.path.exists(TEST_STATA_FILENAME):
+        os.remove(TEST_STATA_FILENAME)
+
+
+@pytest.fixture
+def setup_pickle_file(row_size, force=False):
+    if os.path.exists(TEST_PICKLE_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({
+            'col1': np.arange(row_size),
+            'col2': np.arange(row_size)
+        })
+        df.to_pickle(TEST_PICKLE_FILENAME)
+
+
+@pytest.fixture
+def teardown_pickle_file():
+    if os.path.exists(TEST_PICKLE_FILENAME):
+        os.remove(TEST_PICKLE_FILENAME)
+
+
+@pytest.fixture
+def setup_sql_file(conn, force=False):
+    if os.path.exists(TEST_SQL_FILENAME) and not force:
+        pass
+    else:
+        df = pd.DataFrame({'col1': [0, 1, 2, 3],
+                           'col2': [4, 5, 6, 7],
+                           'col3': [8, 9, 10, 11],
+                           'col4': [12, 13, 14, 15],
+                           'col5': [0, 0, 0, 0]})
+        df.to_sql(TEST_SQL_FILENAME.split(".")[0], conn)
+
+
+@pytest.fixture
+def teardown_sql_file():
+    if os.path.exists(TEST_SQL_FILENAME):
+        os.remove(TEST_SQL_FILENAME)
+
+
 def test_from_parquet_small():

    setup_parquet_file(SMALL_ROW_SIZE)
@@ -90,12 +272,120 @@ def test_from_csv():
    teardown_csv_file()


-def test_from_csv_delimiter():
-    setup_csv_file(SMALL_ROW_SIZE, delimiter='|')
+def test_from_json():
+    setup_json_file(SMALL_ROW_SIZE)

-    pd_df = pd.read_csv(TEST_CSV_FILENAME)
-    ray_df = io.read_csv(TEST_CSV_FILENAME)
+    pd_df = pd.read_json(TEST_JSON_FILENAME)
+    ray_df = io.read_json(TEST_JSON_FILENAME)

    assert ray_df_equals_pandas(ray_df, pd_df)

-    teardown_csv_file()
+    teardown_json_file()
+
+
+def test_from_html():
+    setup_html_file(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_html(TEST_HTML_FILENAME)[0]
+    ray_df = io.read_html(TEST_HTML_FILENAME)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_html_file()
+
+
+@pytest.mark.skip(reason="No clipboard on Travis")
+def test_from_clipboard():
+    setup_clipboard(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_clipboard()
+    ray_df = io.read_clipboard()
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+
+def test_from_excel():
+    setup_excel_file(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_excel(TEST_EXCEL_FILENAME)
+    ray_df = io.read_excel(TEST_EXCEL_FILENAME)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_excel_file()
+
+
+def test_from_feather():
+    setup_feather_file(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_feather(TEST_FEATHER_FILENAME)
+    ray_df = io.read_feather(TEST_FEATHER_FILENAME)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_feather_file()
+
+
+@pytest.mark.skip(reason="Memory overflow on Travis")
+def test_from_hdf():
+    setup_hdf_file(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_hdf(TEST_HDF_FILENAME, key='test')
+    ray_df = io.read_hdf(TEST_HDF_FILENAME, key='test')
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_hdf_file()
+
+
+def test_from_msgpack():
+    setup_msgpack_file(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_msgpack(TEST_MSGPACK_FILENAME)
+    ray_df = io.read_msgpack(TEST_MSGPACK_FILENAME)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_msgpack_file()
+
+
+def test_from_stata():
+    setup_stata_file(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_stata(TEST_STATA_FILENAME)
+    ray_df = io.read_stata(TEST_STATA_FILENAME)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_stata_file()
+
+
+def test_from_pickle():
+    setup_pickle_file(SMALL_ROW_SIZE)
+
+    pd_df = pd.read_pickle(TEST_PICKLE_FILENAME)
+    ray_df = io.read_pickle(TEST_PICKLE_FILENAME)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_pickle_file()
+
+
+def test_from_sql():
+    conn = sqlite3.connect(TEST_SQL_FILENAME)
+    setup_sql_file(conn, True)
+
+    pd_df = pd.read_sql("select * from test", conn)
+    ray_df = io.read_sql("select * from test", conn)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)
+
+    teardown_sql_file()
+
+
+@pytest.mark.skip(reason="No SAS write methods in Pandas")
+def test_from_sas():
+    pd_df = pd.read_sas(TEST_SAS_FILENAME)
+    ray_df = io.read_sas(TEST_SAS_FILENAME)
+
+    assert ray_df_equals_pandas(ray_df, pd_df)