# Mirror of https://github.com/wassname/ray.git
# Synced 2026-07-01 08:19:30 +08:00
# 127 lines, 4.7 KiB, Python
import ray.util.iter as parallel_it
|
|
import ray.util.data as ml_data
|
|
import pytest
|
|
|
|
import pyarrow as pa
|
|
import pyarrow.parquet as pq
|
|
import pandas as pd
|
|
import os
|
|
|
|
|
|
def test_read_parquet(ray_start_regular_shared, tmp_path):
    """Read two parquet files back through ``ml_data.read_parquet``.

    Two small frames are written to ``test1.parquet`` and ``test2.parquet``
    under ``tmp_path``. The directory is then read with ``num_shards=2``,
    first without a column filter and then restricted to column ``"one"``
    and to column ``"two"``. In every case the first two gathered batches
    must equal the (optionally column-restricted) frames that were written.
    """
    first_frame = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    second_frame = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    for file_name, frame in (("test1.parquet", first_frame),
                             ("test2.parquet", second_frame)):
        pq.write_table(
            pa.Table.from_pandas(frame), os.path.join(tmp_path, file_name))

    # without columns
    dataset = ml_data.read_parquet(tmp_path, num_shards=2)
    batches = list(dataset.gather_sync())
    assert first_frame.equals(batches[0])
    assert second_frame.equals(batches[1])

    # with columns one, then with columns two
    for column in ("one", "two"):
        dataset = ml_data.read_parquet(
            tmp_path, num_shards=2, columns=[column])
        batches = list(dataset.gather_sync())
        assert first_frame[[column]].equals(batches[0])
        assert second_frame[[column]].equals(batches[1])
|
|
|
|
|
|
def test_from_parallel_it(ray_start_regular_shared):
    """Build an MLDataset from a parallel iterator and inspect its batches.

    Checks the dataset's ``repr``, that gathering yields two frames of shape
    ``(2, 1)``, and that the values in column ``0`` of the gathered frames
    match the source iterator batched by two.
    """
    source = parallel_it.from_range(4).for_each(lambda x: [x])
    dataset = ml_data.from_parallel_iter(source, batch_size=2)

    expected_repr = ("MLDataset[from_range[4, shards=2]"
                     ".for_each().batch(2).to_pandas()]")
    assert repr(dataset) == expected_repr

    frames = list(dataset.gather_sync())
    assert len(frames) == 2
    for frame in frames:
        assert frame.shape == (2, 1)

    # Compare the raw values: the source iterator batched the same way
    # against column 0 of every gathered frame.
    expected_values = source.flatten().batch(2).gather_sync().flatten()
    gathered = dataset.gather_sync()
    actual_values = gathered.for_each(
        lambda frame: frame[0].to_list()).flatten()
    assert list(actual_values) == list(expected_values)
|
|
|
|
|
|
def test_batch(ray_start_regular_shared):
    """Re-batch an MLDataset and verify batch count, shape and contents.

    A 16-element source batched by two must gather into eight ``(2, 1)``
    frames. After ``batch(4)`` it must gather into four ``(4, 1)`` frames,
    the ``repr`` must record the extra ``batch(4)`` step, and the values in
    column ``0`` must match the source iterator batched by four.
    """
    source = parallel_it.from_range(16).for_each(lambda x: [x])
    dataset = ml_data.from_parallel_iter(source, batch_size=2)

    frames = list(dataset.gather_sync())
    assert len(frames) == 8
    for frame in frames:
        assert frame.shape == (2, 1)

    rebatched = dataset.batch(4)
    expected_repr = ("MLDataset[from_range[16, shards=2]"
                     ".for_each().batch(2).to_pandas().batch(4)]")
    assert repr(rebatched) == expected_repr

    frames = list(rebatched.gather_sync())
    assert len(frames) == 4
    for frame in frames:
        assert frame.shape == (4, 1)

    # Compare the raw values: the source iterator batched by four against
    # column 0 of every re-batched frame.
    expected_values = source.flatten().batch(4).gather_sync().flatten()
    gathered = rebatched.gather_sync()
    actual_values = gathered.for_each(
        lambda frame: frame[0].to_list()).flatten()
    assert list(actual_values) == list(expected_values)
|
|
|
|
|
|
def test_local_shuffle(ray_start_regular_shared):
    """Compare two ``local_shuffle`` results over the same MLDataset.

    Three scenarios are exercised on a 100-element source:

    * ``batch_size=10``, ``shuffle_buffer_size=5``: the two gathered results
      must differ in at least one batch.
    * ``batch_size=1``, ``shuffle_buffer_size=5``: same expectation.
    * ``batch_size=1``, ``shuffle_buffer_size=1``: the two gathered results
      must be identical batch for batch.
    """
    para_it = parallel_it.from_range(100).for_each(lambda x: [x])

    def gather_two_shuffles(batch_size, shuffle_buffer_size):
        """Shuffle one dataset twice and return the paired gathered batches.

        Returns a list of ``(batch_from_first, batch_from_second)`` tuples.
        """
        ds = ml_data.from_parallel_iter(para_it, batch_size=batch_size)
        ds1 = ds.local_shuffle(shuffle_buffer_size=shuffle_buffer_size)
        ds2 = ds.local_shuffle(shuffle_buffer_size=shuffle_buffer_size)

        l1 = list(ds1.gather_sync())
        l2 = list(ds2.gather_sync())
        # zip() silently truncates to the shorter list and all() is True on
        # an empty sequence, so without these checks the comparisons below
        # could pass without comparing every batch (or any batch at all).
        assert l1, "local_shuffle produced no batches"
        assert len(l1) == len(l2)
        return list(zip(l1, l2))

    # batch_size larger than 1 and shuffle_buffer_size larger than 1
    pairs = gather_two_shuffles(batch_size=10, shuffle_buffer_size=5)
    assert not all(df1.equals(df2) for df1, df2 in pairs)

    # batch_size equals 1 and shuffle_buffer_size larger than 1
    pairs = gather_two_shuffles(batch_size=1, shuffle_buffer_size=5)
    assert not all(df1.equals(df2) for df1, df2 in pairs)

    # batch_size equals 1 and shuffle_buffer_size equals 1
    pairs = gather_two_shuffles(batch_size=1, shuffle_buffer_size=1)
    assert all(df1.equals(df2) for df1, df2 in pairs)
|
|
|
|
|
|
def test_union(ray_start_regular_shared):
    """Check ``MLDataset.union`` type checking and resulting ``batch_size``.

    * Unioning datasets built with different values of the last positional
      flag must raise ``TypeError`` mentioning "different repeated type".
    * Unioning datasets with the same batch size keeps that batch size.
    * Unioning datasets with different batch sizes yields ``batch_size == 0``.
    """

    def make_dataset(batch_size, flag):
        # ``flag`` is passed both as the third argument of ``from_range`` and
        # as the fourth argument of ``from_parallel_iter``. Presumably this is
        # the repeat/repeated switch -- confirm against the ray signatures.
        source = parallel_it.from_range(4, 2, flag).for_each(lambda x: [x])
        return ml_data.from_parallel_iter(source, True, batch_size, flag)

    base = make_dataset(2, False)
    mismatched = make_dataset(2, True)

    with pytest.raises(TypeError) as excinfo:
        base.union(mismatched)
    message = str(excinfo.value)
    assert "two MLDataset which have different repeated type" in message

    # union two MLDataset with same batch size
    same_batch = make_dataset(2, False)
    merged = base.union(same_batch)
    assert merged.batch_size == 2

    # union two MLDataset with different batch size
    other_batch = make_dataset(1, False)
    merged = base.union(other_batch)
    # batch_size 0 means batch_size unknown
    assert merged.batch_size == 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this test module directly: hand the file to pytest in
    # verbose mode and propagate its return value as the exit status.
    import sys

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
|