Initial Skeleton for Streaming API (#4126)

2026-07-02 03:50:57 +08:00 · 2019-02-26 12:15:08 -08:00
parent 62055cc01c
commit 89ce4c56aa
17 changed files with 2461 additions and 4 deletions
@@ -0,0 +1,66 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pytest
+import time
+
+import ray
+from ray.experimental.streaming.batched_queue import BatchedQueue
+
+
+@pytest.fixture
+def ray_start():
+    # Start the Ray processes.
+    ray.init(num_cpus=2)
+    yield None
+    # The code after the yield will run as teardown code.
+    ray.shutdown()
+
+
+@ray.remote
+class Reader(object):
+    def __init__(self, queue):
+        self.queue = queue
+        self.num_reads = 0
+        self.start = time.time()
+
+    def read(self, read_slowly):
+        expected_value = 0
+        for _ in range(1000):
+            x = self.queue.read_next()
+            assert x == expected_value, (x, expected_value)
+            expected_value += 1
+            self.num_reads += 1
+            if read_slowly:
+                time.sleep(0.001)
+
+
+def test_batched_queue(ray_start):
+    # Batched queue parameters
+    max_queue_size = 10000  # Max number of batches in queue
+    max_batch_size = 1000  # Max number of elements per batch
+    batch_timeout = 0.001  # 1ms flush timeout
+    prefetch_depth = 10  # Number of batches to prefetch from plasma
+    background_flush = False  # Don't use daemon thread for flushing
+    # Two tests: one with a big queue and slow reader, and
+    # a second one with a small queue and a faster reader
+    for read_slowly in [True, False]:
+        # Construct the batched queue
+        queue = BatchedQueue(
+            max_size=max_queue_size,
+            max_batch_size=max_batch_size,
+            max_batch_time=batch_timeout,
+            prefetch_depth=prefetch_depth,
+            background_flush=background_flush)
+        # Create and start the reader
+        reader = Reader.remote(queue)
+        object_id = reader.read.remote(read_slowly=read_slowly)
+        value = 0
+        for _ in range(1000):
+            queue.put_next(value)
+            value += 1
+        queue._flush_writes()
+        ray.get(object_id)
+        # Test once more with a very small queue size and a faster reader
+        max_queue_size = 10
@@ -0,0 +1,204 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from ray.experimental.streaming.streaming import Environment
+from ray.experimental.streaming.operator import OpType, PStrategy
+
+
+def test_parallelism():
+    """Tests operator parallelism."""
+    env = Environment()
+    # Try setting a common parallelism for all operators
+    env.set_parallelism(2)
+    stream = env.source(None).map(None).filter(None).flat_map(None)
+    env._collect_garbage()
+    for operator in env.operators.values():
+        if operator.type == OpType.Source:
+            # TODO (john): Currently each source has only one instance
+            assert operator.num_instances == 1, (operator.num_instances, 1)
+        else:
+            assert operator.num_instances == 2, (operator.num_instances, 2)
+    # Check again after adding an operator with different parallelism
+    stream.map(None, "Map1").shuffle().set_parallelism(3).map(
+        None, "Map2").set_parallelism(4)
+    env._collect_garbage()
+    for operator in env.operators.values():
+        if operator.type == OpType.Source:
+            assert operator.num_instances == 1, (operator.num_instances, 1)
+        elif operator.name != "Map1" and operator.name != "Map2":
+            assert operator.num_instances == 2, (operator.num_instances, 2)
+        elif operator.name != "Map2":
+            assert operator.num_instances == 3, (operator.num_instances, 3)
+        else:
+            assert operator.num_instances == 4, (operator.num_instances, 4)
+
+
+def test_partitioning():
+    """Tests stream partitioning."""
+    env = Environment()
+    # Try defining multiple partitioning strategies for the same stream
+    _ = env.source(None).shuffle().rescale().broadcast().map(
+        None).broadcast().shuffle()
+    env._collect_garbage()
+    for operator in env.operators.values():
+        p_schemes = operator.partitioning_strategies
+        for scheme in p_schemes.values():
+            # Only last defined strategy should be kept
+            if operator.type == OpType.Source:
+                assert scheme.strategy == PStrategy.Broadcast, (
+                    scheme.strategy, PStrategy.Broadcast)
+            else:
+                assert scheme.strategy == PStrategy.Shuffle, (
+                    scheme.strategy, PStrategy.Shuffle)
+
+
+def test_forking():
+    """Tests stream forking."""
+    env = Environment()
+    # Try forking a stream
+    stream = env.source(None).map(None).set_parallelism(2)
+    # First branch with a shuffle partitioning strategy
+    _ = stream.shuffle().key_by(0).sum(1)
+    # Second branch with the default partitioning strategy
+    _ = stream.key_by(1).sum(2)
+    env._collect_garbage()
+    # Operator ids
+    source_id = None
+    map_id = None
+    keyby1_id = None
+    keyby2_id = None
+    sum1_id = None
+    sum2_id = None
+    # Collect ids
+    for id, operator in env.operators.items():
+        if operator.type == OpType.Source:
+            source_id = id
+        elif operator.type == OpType.Map:
+            map_id = id
+        elif operator.type == OpType.KeyBy:
+            if operator.other_args == 0:
+                keyby1_id = id
+            else:
+                assert operator.other_args == 1, (operator.other_args, 1)
+                keyby2_id = id
+        elif operator.type == OpType.Sum:
+            if operator.other_args == 1:
+                sum1_id = id
+            else:
+                assert operator.other_args == 2, (operator.other_args, 2)
+                sum2_id = id
+    # Check generated streams and their partitioning
+    for source, destination in env.logical_topo.edges:
+        operator = env.operators[source]
+        if source == source_id:
+            assert destination == map_id, (destination, map_id)
+        elif source == map_id:
+            p_scheme = operator.partitioning_strategies[destination]
+            strategy = p_scheme.strategy
+            key_index = env.operators[destination].other_args
+            if key_index == 0:  # This must be the first branch
+                assert strategy == PStrategy.Shuffle, (strategy,
+                                                       PStrategy.Shuffle)
+                assert destination == keyby1_id, (destination, keyby1_id)
+            else:  # This must be the second branch
+                assert key_index == 1, (key_index, 1)
+                assert strategy == PStrategy.Forward, (strategy,
+                                                       PStrategy.Forward)
+                assert destination == keyby2_id, (destination, keyby2_id)
+        elif source == keyby1_id or source == keyby2_id:
+            p_scheme = operator.partitioning_strategies[destination]
+            strategy = p_scheme.strategy
+            key_index = env.operators[destination].other_args
+            if key_index == 1:  # This must be the first branch
+                assert strategy == PStrategy.ShuffleByKey, (
+                    strategy, PStrategy.ShuffleByKey)
+                assert destination == sum1_id, (destination, sum1_id)
+            else:  # This must be the second branch
+                assert key_index == 2, (key_index, 2)
+                assert strategy == PStrategy.ShuffleByKey, (
+                    strategy, PStrategy.ShuffleByKey)
+                assert destination == sum2_id, (destination, sum2_id)
+        else:  # This must be a sum operator
+            assert operator.type == OpType.Sum, (operator.type, OpType.Sum)
+
+
+def _test_shuffle_channels():
+    """Tests shuffling connectivity."""
+    env = Environment()
+    # Try defining a shuffle
+    _ = env.source(None).shuffle().map(None).set_parallelism(4)
+    expected = [(0, 0), (0, 1), (0, 2), (0, 3)]
+    _test_channels(env, expected)
+
+
+def _test_forward_channels():
+    """Tests forward connectivity."""
+    env = Environment()
+    # Try the default partitioning strategy
+    _ = env.source(None).set_parallelism(4).map(None).set_parallelism(2)
+    expected = [(0, 0), (1, 1), (2, 0), (3, 1)]
+    _test_channels(env, expected)
+
+
+def _test_broadcast_channels():
+    """Tests broadcast connectivity."""
+    env = Environment()
+    # Try broadcasting
+    _ = env.source(None).set_parallelism(4).broadcast().map(
+        None).set_parallelism(2)
+    expected = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1), (3, 0), (3, 1)]
+    _test_channels(env, expected)
+
+
+def _test_round_robin_channels():
+    """Tests round-robin connectivity."""
+    env = Environment()
+    # Try broadcasting
+    _ = env.source(None).round_robin().map(None).set_parallelism(2)
+    expected = [(0, 0), (0, 1)]
+    _test_channels(env, expected)
+
+
+def _test_channels(environment, expected_channels):
+    """Tests operator connectivity."""
+    environment._collect_garbage()
+    map_id = None
+    # Get id
+    for id, operator in environment.operators.items():
+        if operator.type == OpType.Map:
+            map_id = id
+    # Collect channels
+    channels_per_destination = []
+    for operator in environment.operators.values():
+        channels_per_destination.append(
+            environment._generate_channels(operator))
+    # Check actual connectivity
+    actual = []
+    for destination in channels_per_destination:
+        for channels in destination.values():
+            for channel in channels:
+                src_instance_id = channel.src_instance_id
+                dst_instance_id = channel.dst_instance_id
+                connection = (src_instance_id, dst_instance_id)
+                assert channel.dst_operator_id == map_id, (
+                    channel.dst_operator_id, map_id)
+                actual.append(connection)
+    # Make sure connections are as expected
+    set_1 = set(expected_channels)
+    set_2 = set(actual)
+    assert set_1 == set_2, (set_1, set_2)
+
+
+def test_channel_generation():
+    """Tests data channel generation."""
+    _test_shuffle_channels()
+    _test_broadcast_channels()
+    _test_round_robin_channels()
+    _test_forward_channels()
+
+
+# TODO (john): Add simple wordcount test
+def test_wordcount():
+    """Tests a simple streaming wordcount."""
+    pass