[Streaming] Streaming Python API (#6755)

2026-06-30 09:41:11 +08:00 · 2020-02-25 10:33:33 +08:00
parent 2c1f4fd82c
commit 8b6784de06
71 changed files with 2701 additions and 1928 deletions
@@ -1,67 +0,0 @@
-import argparse
-import logging
-import time
-
-import ray
-from ray.streaming.streaming import Environment
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input-file", required=True, help="the input text file")
-
-
-# A class used to check attribute-based key selection
-class Record:
-    def __init__(self, record):
-        k, _ = record
-        self.word = k
-        self.record = record
-
-
-# Splits input line into words and outputs objects of type Record
-# each one consisting of a key (word) and a tuple (word,1)
-def splitter(line):
-    records = []
-    words = line.split()
-    for w in words:
-        records.append(Record((w, 1)))
-    return records
-
-
-# Receives an object of type Record and returns the actual tuple
-def as_tuple(record):
-    return record.record
-
-
-if __name__ == "__main__":
-    # Get program parameters
-    args = parser.parse_args()
-    input_file = str(args.input_file)
-
-    ray.init()
-    ray.register_custom_serializer(Record, use_dict=True)
-
-    # A Ray streaming environment with the default configuration
-    env = Environment()
-    env.set_parallelism(2)  # Each operator will be executed by two actors
-
-    # 'key_by("word")' physically partitions the stream of records
-    # based on the hash value of the 'word' attribute (see Record class above)
-    # 'map(as_tuple)' maps a record of type Record into a tuple
-    # 'sum(1)' sums the 2nd element of the tuple, i.e. the word count
-    stream = env.read_text_file(input_file) \
-                .round_robin() \
-                .flat_map(splitter) \
-                .key_by("word") \
-                .map(as_tuple) \
-                .sum(1) \
-                .inspect(print)     # Prints the content of the
-    # stream to stdout
-    start = time.time()
-    env_handle = env.execute()  # Deploys and executes the dataflow
-    ray.get(env_handle)  # Stay alive until execution finishes
-    end = time.time()
-    logger.info("Elapsed time: {} secs".format(end - start))
-    logger.debug("Output stream id: {}".format(stream.id))
@@ -1,52 +0,0 @@
-import argparse
-import logging
-import time
-
-import ray
-from ray.streaming.config import Config
-from ray.streaming.streaming import Environment, Conf
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input-file", required=True, help="the input text file")
-
-
-# Test functions
-def splitter(line):
-    return line.split()
-
-
-def filter_fn(word):
-    if "f" in word:
-        return True
-    return False
-
-
-if __name__ == "__main__":
-
-    args = parser.parse_args()
-
-    ray.init(local_mode=False)
-
-    # A Ray streaming environment with the default configuration
-    env = Environment(config=Conf(channel_type=Config.NATIVE_CHANNEL))
-
-    # Stream represents the ouput of the filter and
-    # can be forked into other dataflows
-    stream = env.read_text_file(args.input_file) \
-        .shuffle() \
-        .flat_map(splitter) \
-        .set_parallelism(2) \
-        .filter(filter_fn) \
-        .set_parallelism(2) \
-        .inspect(lambda x: print("result", x))     # Prints the contents of the
-    # stream to stdout
-    start = time.time()
-    env_handle = env.execute()
-    ray.get(env_handle)  # Stay alive until execution finishes
-    env.wait_finish()
-    end = time.time()
-    logger.info("Elapsed time: {} secs".format(end - start))
-    logger.debug("Output stream id: {}".format(stream.id))
@@ -1,5 +0,0 @@
-This is
-a test file
-to test if example
-works
-fine
@@ -4,7 +4,8 @@ import time

 import ray
 import wikipedia
-from ray.streaming.streaming import Environment
+from ray.streaming import StreamingContext
+from ray.streaming.config import Config

 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -23,7 +24,6 @@ class Wikipedia:
    def __init__(self, title_file):
        # Titles in this file will be as queries
        self.title_file = title_file
-        # TODO (john): Handle possible exception here
        self.title_reader = iter(list(open(self.title_file, "r").readlines()))
        self.done = False
        self.article_done = True
@@ -57,21 +57,7 @@ class Wikipedia:
 # Splits input line into words and
 # outputs records of the form (word,1)
 def splitter(line):
-    records = []
-    words = line.split()
-    for w in words:
-        records.append((w, 1))
-    return records
-
-
-# Returns the first attribute of a tuple
-def key_selector(tuple):
-    return tuple[0]
-
-
-# Returns the second attribute of a tuple
-def attribute_selector(tuple):
-    return tuple[1]
+    return [(word, 1) for word in line.split()]


 if __name__ == "__main__":
@@ -79,27 +65,23 @@ if __name__ == "__main__":
    args = parser.parse_args()
    titles_file = str(args.titles_file)

-    ray.init()
+    ray.init(load_code_from_local=True, include_java=True)

+    ctx = StreamingContext.Builder() \
+        .option(Config.CHANNEL_TYPE, Config.NATIVE_CHANNEL) \
+        .build()
    # A Ray streaming environment with the default configuration
-    env = Environment()
-    env.set_parallelism(2)  # Each operator will be executed by two actors
+    ctx.set_parallelism(1)  # Each operator will be executed by one actor

-    # The following dataflow is a simple streaming wordcount
-    #  with a rolling sum operator.
-    # It reads articles from wikipedia, splits them in words,
-    # shuffles words, and counts the occurences of each word.
-    stream = env.source(Wikipedia(titles_file)) \
-                .round_robin() \
-                .flat_map(splitter) \
-                .key_by(key_selector) \
-                .sum(attribute_selector) \
-                .inspect(print)     # Prints the contents of the
-    # stream to stdout
+    # Reads articles from Wikipedia, splits them into words,
+    # shuffles words, and counts the occurrences of each word.
+    stream = ctx.source(Wikipedia(titles_file)) \
+        .flat_map(splitter) \
+        .key_by(lambda x: x[0]) \
+        .reduce(lambda old_value, new_value:
+                (old_value[0], old_value[1] + new_value[1])) \
+        .sink(print)
    start = time.time()
-    env_handle = env.execute()  # Deploys and executes the dataflow
-    ray.get(env_handle)  # Stay alive until execution finishes
-    env.wait_finish()
+    ctx.execute("wordcount")
    end = time.time()
    logger.info("Elapsed time: {} secs".format(end - start))
-    logger.debug("Output stream id: {}".format(stream.id))