[Streaming] Streaming data transfer and python integration (#6185)

2026-07-02 04:59:12 +08:00 · 2019-12-10 20:33:24 +08:00
parent c1d4ab8bb4
commit 6272907a57
93 changed files with 8434 additions and 1480 deletions
@@ -0,0 +1,8 @@
+New York City
+Berlin
+London
+Paris
+United States
+Germany
+France
+United Kingdom
@@ -0,0 +1,71 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import logging
+import time
+
+import ray
+from ray.streaming.streaming import Environment
+
+# Module-level logger; basicConfig sets the root logger to INFO level
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+# Command-line interface: the input text file is a mandatory argument
+parser = argparse.ArgumentParser()
+parser.add_argument("--input-file", required=True, help="the input text file")
+
+
+# A class used to check attribute-based key selection.
+# It wraps a two-element record and exposes its first
+# element as the 'word' attribute
+class Record(object):
+    def __init__(self, record):
+        # Unpacking requires 'record' to hold exactly two elements
+        word, _count = record
+        self.word = word
+        self.record = record
+
+
+# Splits the input line into words and outputs a list of Record objects,
+# one per word, each built from the tuple (word, 1)
+def splitter(line):
+    return [Record((word, 1)) for word in line.split()]
+
+
+# Unwraps an object of type Record, returning the tuple stored
+# in its 'record' attribute
+def as_tuple(record):
+    wrapped = record.record
+    return wrapped
+
+
+if __name__ == "__main__":
+    # Get program parameters
+    args = parser.parse_args()
+    input_file = str(args.input_file)
+
+    ray.init()
+    # Register the Record class with Ray's custom serializer (use_dict=True)
+    ray.register_custom_serializer(Record, use_dict=True)
+
+    # A Ray streaming environment with the default configuration
+    env = Environment()
+    env.set_parallelism(2)  # Each operator will be executed by two actors
+
+    # 'key_by("word")' physically partitions the stream of records
+    # based on the hash value of the 'word' attribute (see Record class above)
+    # 'map(as_tuple)' maps a record of type Record into a tuple
+    # 'sum(1)' sums the 2nd element of the tuple, i.e. the word count
+    # 'inspect(print)' prints the content of the stream to stdout
+    stream = env.read_text_file(input_file) \
+                .round_robin() \
+                .flat_map(splitter) \
+                .key_by("word") \
+                .map(as_tuple) \
+                .sum(1) \
+                .inspect(print)
+    start = time.time()
+    env_handle = env.execute()  # Deploys and executes the dataflow
+    ray.get(env_handle)  # Stay alive until execution finishes
+    end = time.time()
+    logger.info("Elapsed time: {} secs".format(end - start))
+    # NOTE: logging is configured at INFO level in this file, so this
+    # debug message is not shown unless the level is lowered
+    logger.debug("Output stream id: {}".format(stream.id))
@@ -0,0 +1,56 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import logging
+import time
+
+import ray
+from ray.streaming.config import Config
+from ray.streaming.streaming import Environment, Conf
+
+# Module-level logger; basicConfig sets the root logger to INFO level
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+# Command-line interface: the input text file is a mandatory argument
+parser = argparse.ArgumentParser()
+parser.add_argument("--input-file", required=True, help="the input text file")
+
+
+# Test functions
+def splitter(line):
+    # split() without arguments; for a str this breaks on whitespace
+    words = line.split()
+    return words
+
+
+def filter_fn(word):
+    # Keep only the words that contain the letter "f"
+    return "f" in word
+
+
+if __name__ == "__main__":
+
+    # Get program parameters
+    args = parser.parse_args()
+
+    ray.init(local_mode=False)
+
+    # A Ray streaming environment whose channel type is set to
+    # Config.NATIVE_CHANNEL
+    env = Environment(config=Conf(channel_type=Config.NATIVE_CHANNEL))
+
+    # Stream represents the output of the filter and
+    # can be forked into other dataflows.
+    # 'inspect(...)' prints the contents of the stream to stdout
+    stream = env.read_text_file(args.input_file) \
+        .shuffle() \
+        .flat_map(splitter) \
+        .set_parallelism(2) \
+        .filter(filter_fn) \
+        .set_parallelism(2) \
+        .inspect(lambda x: print("result", x))
+    start = time.time()
+    env_handle = env.execute()
+    ray.get(env_handle)  # Stay alive until execution finishes
+    env.wait_finish()
+    end = time.time()
+    logger.info("Elapsed time: {} secs".format(end - start))
+    # NOTE: logging is configured at INFO level in this file, so this
+    # debug message is not shown unless the level is lowered
+    logger.debug("Output stream id: {}".format(stream.id))
@@ -0,0 +1,5 @@
+This is
+a test file
+to test if example
+works
+fine
@@ -0,0 +1,109 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import logging
+import time
+
+import ray
+import wikipedia
+from ray.streaming.streaming import Environment
+
+# Module-level logger; basicConfig sets the root logger to INFO level
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+# Command-line interface: the file with the titles is a mandatory argument
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--titles-file",
+    required=True,
+    help="the file containing the wikipedia titles to lookup")
+
+
+# A custom data source that reads articles from wikipedia
+# Custom data sources need to implement a get_next() method
+# that returns the next data element, in this case sentences
+class Wikipedia(object):
+    def __init__(self, title_file):
+        # Titles in this file will be used as queries, one title per line
+        self.title_file = title_file
+        # TODO (john): Handle possible exception here
+        # Read every title up front inside a 'with' block so the file
+        # handle is closed immediately instead of being leaked
+        with open(self.title_file, "r") as title_lines:
+            titles = [line.strip() for line in title_lines]
+        # Titles are stripped of the trailing newline, and blank lines
+        # are dropped so that an empty query is never issued
+        self.title_reader = iter([title for title in titles if title])
+        self.done = False  # True once all titles have been consumed
+        self.article_done = True  # True when a new article must be fetched
+        self.sentences = iter([])  # Sentences of the current article
+
+    # Returns next sentence from a wikipedia article,
+    # or None when there are no more titles to look up
+    def get_next(self):
+        if self.done:
+            return None  # Source exhausted
+        while True:
+            if self.article_done:
+                try:  # Try next title
+                    next_title = next(self.title_reader)
+                except StopIteration:
+                    self.done = True  # Source exhausted
+                    return None
+                # Get next article
+                logger.debug("Next article: {}".format(next_title))
+                article = wikipedia.page(next_title).content
+                # Split article in sentences
+                self.sentences = iter(article.split("."))
+                self.article_done = False
+            try:  # Try next sentence
+                sentence = next(self.sentences)
+                logger.debug("Next sentence: {}".format(sentence))
+                return sentence
+            except StopIteration:
+                # Current article is finished; loop to fetch the next one
+                self.article_done = True
+
+
+# Breaks the input line into words and pairs each
+# word with a count of one: records have the form (word,1)
+def splitter(line):
+    return [(word, 1) for word in line.split()]
+
+
+# Key selector: returns the element at index 0 of a tuple
+# (the parameter name shadows the builtin but is kept for callers)
+def key_selector(tuple):
+    first = tuple[0]
+    return first
+
+
+# Attribute selector: returns the element at index 1 of a tuple
+# (the parameter name shadows the builtin but is kept for callers)
+def attribute_selector(tuple):
+    second = tuple[1]
+    return second
+
+
+if __name__ == "__main__":
+    # Get program parameters
+    args = parser.parse_args()
+    titles_file = str(args.titles_file)
+
+    ray.init()
+
+    # A Ray streaming environment with the default configuration
+    env = Environment()
+    env.set_parallelism(2)  # Each operator will be executed by two actors
+
+    # The following dataflow is a simple streaming wordcount
+    # with a rolling sum operator.
+    # It reads articles from wikipedia, splits them in words,
+    # shuffles words, and counts the occurrences of each word.
+    # 'inspect(print)' prints the contents of the stream to stdout
+    stream = env.source(Wikipedia(titles_file)) \
+                .round_robin() \
+                .flat_map(splitter) \
+                .key_by(key_selector) \
+                .sum(attribute_selector) \
+                .inspect(print)
+    start = time.time()
+    env_handle = env.execute()  # Deploys and executes the dataflow
+    ray.get(env_handle)  # Stay alive until execution finishes
+    env.wait_finish()
+    end = time.time()
+    logger.info("Elapsed time: {} secs".format(end - start))
+    # NOTE: logging is configured at INFO level in this file, so this
+    # debug message is not shown unless the level is lowered
+    logger.debug("Output stream id: {}".format(stream.id))