mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 04:59:12 +08:00
[Streaming] Streaming data transfer and python integration (#6185)
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
New York City
|
||||
Berlin
|
||||
London
|
||||
Paris
|
||||
United States
|
||||
Germany
|
||||
France
|
||||
United Kingdom
|
||||
@@ -0,0 +1,71 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.streaming.streaming import Environment
|
||||
|
||||
# Configure root logging at INFO level and obtain this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Command-line interface: one mandatory option naming the text file to read.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input-file",
    required=True,
    help="the input text file",
)
|
||||
|
||||
|
||||
# A class used to check attribute-based key selection
|
||||
class Record(object):
    """Pairs a word with the tuple it came from.

    The word is exposed as the ``word`` attribute so that a stream of
    ``Record`` objects can be keyed by attribute name; the untouched input
    tuple stays available as ``record``.
    """

    def __init__(self, record):
        """Store ``record`` and expose its first item as ``word``.

        ``record`` must unpack into exactly two items; anything else makes
        the unpacking below raise.
        """
        self.word, _ = record
        self.record = record
|
||||
|
||||
|
||||
# Splits input line into words and outputs objects of type Record
|
||||
# each one consisting of a key (word) and a tuple (word,1)
|
||||
def splitter(line):
    """Split ``line`` on whitespace and wrap every word in a ``Record``.

    Args:
        line: The text line to tokenize.

    Returns:
        A list holding one ``Record((word, 1))`` per word, in the order the
        words appear; an empty list when the line contains no words.
    """
    return [Record((word, 1)) for word in line.split()]
|
||||
|
||||
|
||||
# Receives an object of type Record and returns the actual tuple
|
||||
def as_tuple(record):
    """Return the tuple stored in the ``record`` attribute of ``record``."""
    wrapped = record.record
    return wrapped
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Read the command-line parameters
    args = parser.parse_args()
    text_path = str(args.input_file)

    ray.init()
    ray.register_custom_serializer(Record, use_dict=True)

    # A Ray streaming environment with the default configuration;
    # each operator will be executed by two actors
    env = Environment()
    env.set_parallelism(2)

    # Dataflow, step by step:
    #   key_by("word")  partitions the records by the hash of their 'word'
    #                   attribute (see the Record class)
    #   map(as_tuple)   turns each Record back into its tuple
    #   sum(1)          sums the 2nd element of the tuple, i.e. the word count
    #   inspect(print)  prints the content of the stream to stdout
    stream = (
        env.read_text_file(text_path)
        .round_robin()
        .flat_map(splitter)
        .key_by("word")
        .map(as_tuple)
        .sum(1)
        .inspect(print)
    )

    started_at = time.time()
    env_handle = env.execute()  # Deploys and executes the dataflow
    ray.get(env_handle)  # Stay alive until execution finishes
    finished_at = time.time()

    logger.info("Elapsed time: {} secs".format(finished_at - started_at))
    logger.debug("Output stream id: {}".format(stream.id))
|
||||
@@ -0,0 +1,56 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.streaming.config import Config
|
||||
from ray.streaming.streaming import Environment, Conf
|
||||
|
||||
# Configure root logging at INFO level and obtain this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Command-line interface: one mandatory option naming the text file to read.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--input-file",
    required=True,
    help="the input text file",
)
|
||||
|
||||
|
||||
# Test functions
|
||||
def splitter(line):
    """Break ``line`` into its whitespace-separated words.

    Returns the words as a list, in order of appearance; the list is empty
    when the line contains no words.
    """
    tokens = line.split()
    return tokens
|
||||
|
||||
|
||||
def filter_fn(word):
    """Tell whether ``word`` contains the letter ``"f"``.

    Args:
        word: The word to test.

    Returns:
        ``True`` if ``"f"`` occurs in ``word``, ``False`` otherwise.
    """
    # The membership test already yields the boolean the filter needs.
    return "f" in word
|
||||
|
||||
|
||||
if __name__ == "__main__":
    args = parser.parse_args()

    ray.init(local_mode=False)

    # A Ray streaming environment configured with the native channel type
    env = Environment(config=Conf(channel_type=Config.NATIVE_CHANNEL))

    # 'stream' represents the output of the filter (after inspection) and
    # can be forked into other dataflows.
    # inspect(...) prints the contents of the stream to stdout.
    stream = (
        env.read_text_file(args.input_file)
        .shuffle()
        .flat_map(splitter)
        .set_parallelism(2)
        .filter(filter_fn)
        .set_parallelism(2)
        .inspect(lambda x: print("result", x))
    )

    started_at = time.time()
    env_handle = env.execute()
    ray.get(env_handle)  # Stay alive until execution finishes
    env.wait_finish()
    finished_at = time.time()

    logger.info("Elapsed time: {} secs".format(finished_at - started_at))
    logger.debug("Output stream id: {}".format(stream.id))
|
||||
@@ -0,0 +1,5 @@
|
||||
This is
|
||||
a test file
|
||||
to test if example
|
||||
works
|
||||
fine
|
||||
@@ -0,0 +1,109 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
import wikipedia
|
||||
from ray.streaming.streaming import Environment
|
||||
|
||||
# Configure root logging at INFO level and obtain this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Command-line interface: one mandatory option naming the titles file.
_TITLES_HELP = "the file containing the wikipedia titles to lookup"
parser = argparse.ArgumentParser()
parser.add_argument("--titles-file", required=True, help=_TITLES_HELP)
|
||||
|
||||
|
||||
# A custom data source that reads articles from wikipedia
|
||||
# Custom data sources need to implement a get_next() method
|
||||
# that returns the next data element, in this case sentences
|
||||
class Wikipedia(object):
    """Custom data source that emits the sentences of wikipedia articles.

    The article titles are loaded from a text file (one title per line) when
    the source is built. Every call to ``get_next()`` hands out the next
    sentence of the current article and moves on to the next title once the
    article has no sentences left.
    """

    def __init__(self, title_file):
        """Load the titles that will be used as queries.

        Args:
            title_file: Path of a text file with one article title per line.
                Surrounding whitespace is stripped from each title and
                blank lines are ignored.

        Raises:
            IOError: If the file cannot be opened or read (``OSError`` on
                Python 3).
        """
        # Titles in this file will be used as queries
        self.title_file = title_file
        # TODO (john): Handle possible exception here
        # The context manager closes the file as soon as the titles are
        # loaded instead of leaving the handle open.
        with open(self.title_file, "r") as title_lines:
            stripped = [line.strip() for line in title_lines]
        # Drop blank lines so that no empty query is sent to wikipedia.
        titles = [title for title in stripped if title]
        # NOTE(review): kept as a list iterator rather than a generator --
        # presumably the source object gets serialized when it is shipped
        # to an actor; confirm before changing the iterator type.
        self.title_reader = iter(titles)
        self.done = False
        self.article_done = True
        self.sentences = iter([])

    # Returns next sentence from a wikipedia article
    def get_next(self):
        """Return the next sentence, or ``None`` when the source is exhausted.

        Articles are split into sentences on ``"."``. Once every title has
        been consumed the source is marked as done and each later call
        returns ``None`` immediately.
        """
        if self.done:
            return None  # Source exhausted
        while True:
            if self.article_done:
                try:  # Try next title
                    next_title = next(self.title_reader)
                except StopIteration:
                    self.done = True  # Source exhausted
                    return None
                # Get next article
                logger.debug("Next article: {}".format(next_title))
                article = wikipedia.page(next_title).content
                # Split article in sentences
                self.sentences = iter(article.split("."))
                self.article_done = False
            try:  # Try next sentence
                sentence = next(self.sentences)
            except StopIteration:
                # Current article is finished; loop around to the next title
                self.article_done = True
                continue
            logger.debug("Next sentence: {}".format(sentence))
            return sentence
|
||||
|
||||
|
||||
# Splits input line into words and
|
||||
# outputs records of the form (word,1)
|
||||
def splitter(line):
    """Split ``line`` on whitespace into ``(word, 1)`` records.

    Args:
        line: The text line to tokenize.

    Returns:
        A list with one ``(word, 1)`` tuple per word, in the order the words
        appear; an empty list when the line contains no words.
    """
    return [(word, 1) for word in line.split()]
|
||||
|
||||
|
||||
# Returns the first attribute of a tuple
|
||||
def key_selector(tuple):
    """Return the first element of ``tuple``."""
    # NOTE: the parameter name shadows the builtin ``tuple``; it is left
    # untouched so the function signature stays the same.
    first = tuple[0]
    return first
|
||||
|
||||
|
||||
# Returns the second attribute of a tuple
|
||||
def attribute_selector(tuple):
    """Return the second element of ``tuple``."""
    # NOTE: the parameter name shadows the builtin ``tuple``; it is left
    # untouched so the function signature stays the same.
    second = tuple[1]
    return second
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Read the command-line parameters
    args = parser.parse_args()
    titles_path = str(args.titles_file)

    ray.init()

    # A Ray streaming environment with the default configuration;
    # each operator will be executed by two actors
    env = Environment()
    env.set_parallelism(2)

    # A simple streaming wordcount with a rolling sum operator: it reads
    # articles from wikipedia, splits them into words, shuffles the words
    # and counts the occurrences of each word.
    # inspect(print) prints the contents of the stream to stdout.
    stream = (
        env.source(Wikipedia(titles_path))
        .round_robin()
        .flat_map(splitter)
        .key_by(key_selector)
        .sum(attribute_selector)
        .inspect(print)
    )

    started_at = time.time()
    env_handle = env.execute()  # Deploys and executes the dataflow
    ray.get(env_handle)  # Stay alive until execution finishes
    env.wait_finish()
    finished_at = time.time()

    logger.info("Elapsed time: {} secs".format(finished_at - started_at))
    logger.debug("Output stream id: {}".format(stream.id))
|
||||
Reference in New Issue
Block a user