[Streaming] Streaming Python API (#6755)

This commit is contained in:
chaokunyang
2020-02-25 10:33:33 +08:00
committed by GitHub
parent 2c1f4fd82c
commit 8b6784de06
71 changed files with 2701 additions and 1928 deletions
@@ -1,67 +0,0 @@
import argparse
import logging
import time
import ray
from ray.streaming.streaming import Environment
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Command-line interface: the path of the text file to read is mandatory.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input-file",
    help="the input text file",
    required=True,
)
class Record:
    """Wrapper around a (key, value) pair, used to check attribute-based
    key selection.

    Attributes are set in this order: ``word`` (the first element of the
    pair) and ``record`` (the pair itself, untouched).
    """

    def __init__(self, record):
        # Unpacking requires 'record' to hold exactly two elements;
        # the second one is not needed here.
        self.word, _ = record
        self.record = record
def splitter(line):
    """Split an input line into words and wrap each word in a Record.

    Args:
        line: The text line to split on whitespace.

    Returns:
        A list with one Record per word, each built from the tuple
        (word, 1), i.e. the word as key together with a count of 1.
        An empty or whitespace-only line yields an empty list.
    """
    return [Record((word, 1)) for word in line.split()]
def as_tuple(record):
    """Return the tuple stored in the ``record`` attribute of a Record.

    Args:
        record: An object exposing a ``record`` attribute.

    Returns:
        The value of ``record.record``, unchanged.
    """
    wrapped = record.record
    return wrapped
if __name__ == "__main__":
    # Read the program parameters from the command line
    args = parser.parse_args()
    input_file = str(args.input_file)

    ray.init()
    ray.register_custom_serializer(Record, use_dict=True)

    # Ray streaming environment with the default configuration;
    # each operator will be executed by two actors
    env = Environment()
    env.set_parallelism(2)

    # Dataflow, step by step:
    #  - 'key_by("word")' physically partitions the stream of records
    #    based on the hash value of the 'word' attribute (see Record)
    #  - 'map(as_tuple)' maps a record of type Record into a tuple
    #  - 'sum(1)' sums the 2nd element of the tuple, i.e. the word count
    #  - 'inspect(print)' prints the content of the stream to stdout
    stream = (
        env.read_text_file(input_file)
        .round_robin()
        .flat_map(splitter)
        .key_by("word")
        .map(as_tuple)
        .sum(1)
        .inspect(print)
    )

    start = time.time()
    execution_handle = env.execute()  # Deploys and executes the dataflow
    ray.get(execution_handle)  # Stay alive until execution finishes
    elapsed = time.time() - start

    logger.info("Elapsed time: {} secs".format(elapsed))
    logger.debug("Output stream id: {}".format(stream.id))
-52
View File
@@ -1,52 +0,0 @@
import argparse
import logging
import time
import ray
from ray.streaming.config import Config
from ray.streaming.streaming import Environment, Conf
# Root logging is configured at INFO level before the module logger is made.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The only command-line option is the (required) path of the input file.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input-file",
    help="the input text file",
    required=True,
)
# Test functions
def splitter(line):
    """Split a line on whitespace and return the resulting list of words."""
    words = line.split()
    return words
def filter_fn(word):
    """Tell whether a word should pass the filter.

    Args:
        word: The word to test.

    Returns:
        True if the word contains the lowercase letter "f",
        False otherwise (including for an empty word).
    """
    # The membership test already yields a bool; no branching is needed.
    return "f" in word
if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(local_mode=False)

    # A Ray streaming environment whose configuration sets the
    # channel type to Config.NATIVE_CHANNEL
    native_conf = Conf(channel_type=Config.NATIVE_CHANNEL)
    env = Environment(config=native_conf)

    # 'stream' represents the output of the filter and
    # can be forked into other dataflows
    stream = (
        env.read_text_file(args.input_file)
        .shuffle()
        .flat_map(splitter)
        .set_parallelism(2)
        .filter(filter_fn)
        .set_parallelism(2)
        # Prints the contents of the stream to stdout
        .inspect(lambda x: print("result", x))
    )

    start = time.time()
    execution_handle = env.execute()
    ray.get(execution_handle)  # Stay alive until execution finishes
    env.wait_finish()
    elapsed = time.time() - start

    logger.info("Elapsed time: {} secs".format(elapsed))
    logger.debug("Output stream id: {}".format(stream.id))
-5
View File
@@ -1,5 +0,0 @@
This is
a test file
to test if example
works
fine
+17 -35
View File
@@ -4,7 +4,8 @@ import time
import ray
import wikipedia
from ray.streaming.streaming import Environment
from ray.streaming import StreamingContext
from ray.streaming.config import Config
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -23,7 +24,6 @@ class Wikipedia:
def __init__(self, title_file):
# Titles in this file will be as queries
self.title_file = title_file
# TODO (john): Handle possible exception here
self.title_reader = iter(list(open(self.title_file, "r").readlines()))
self.done = False
self.article_done = True
@@ -57,21 +57,7 @@ class Wikipedia:
# Splits input line into words and
# outputs records of the form (word,1)
def splitter(line):
records = []
words = line.split()
for w in words:
records.append((w, 1))
return records
# Returns the first attribute of a tuple
def key_selector(tuple):
return tuple[0]
# Returns the second attribute of a tuple
def attribute_selector(tuple):
return tuple[1]
return [(word, 1) for word in line.split()]
if __name__ == "__main__":
@@ -79,27 +65,23 @@ if __name__ == "__main__":
args = parser.parse_args()
titles_file = str(args.titles_file)
ray.init()
ray.init(load_code_from_local=True, include_java=True)
ctx = StreamingContext.Builder() \
.option(Config.CHANNEL_TYPE, Config.NATIVE_CHANNEL) \
.build()
# A Ray streaming environment with the default configuration
env = Environment()
env.set_parallelism(2) # Each operator will be executed by two actors
ctx.set_parallelism(1) # Each operator will be executed by one actor
# The following dataflow is a simple streaming wordcount
# with a rolling sum operator.
# It reads articles from wikipedia, splits them in words,
# shuffles words, and counts the occurences of each word.
stream = env.source(Wikipedia(titles_file)) \
.round_robin() \
.flat_map(splitter) \
.key_by(key_selector) \
.sum(attribute_selector) \
.inspect(print) # Prints the contents of the
# stream to stdout
# Reads articles from wikipedia, splits them in words,
# shuffles words, and counts the occurrences of each word.
stream = ctx.source(Wikipedia(titles_file)) \
.flat_map(splitter) \
.key_by(lambda x: x[0]) \
.reduce(lambda old_value, new_value:
(old_value[0], old_value[1] + new_value[1])) \
.sink(print)
start = time.time()
env_handle = env.execute() # Deploys and executes the dataflow
ray.get(env_handle) # Stay alive until execution finishes
env.wait_finish()
ctx.execute("wordcount")
end = time.time()
logger.info("Elapsed time: {} secs".format(end - start))
logger.debug("Output stream id: {}".format(stream.id))