diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst index f8e4f21f3..25cd0e8f7 100644 --- a/doc/source/serve/advanced.rst +++ b/doc/source/serve/advanced.rst @@ -185,6 +185,28 @@ The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or ` handle = serve.get_handle("api_endpoint") handler.options(shard_key=session_id).remote(args) +Composing Multiple Models +========================= +Ray Serve supports composing individually scalable models into a single model +out of the box. For instance, you can combine multiple models to perform +stacking or ensembling. + +To define a higher-level composed model you need to do three things: + +1. Define your underlying models (the ones that you will compose together) as + Ray Serve backends. +2. Define your composed model, using the handles of the underlying models + (see the example below). +3. Define an endpoint representing this composed model and query it! + +In order to avoid synchronous execution in the composed model (which would make +calls to the composed model very slow), you'll need to make the function +asynchronous by using an ``async def``. You'll see this in the example below. + +That's it. Let's take a look at an example: + +.. literalinclude:: ../../../python/ray/serve/examples/doc/snippet_model_composition.py + .. _serve-faq: diff --git a/doc/source/serve/key-concepts.rst b/doc/source/serve/key-concepts.rst index d28ecabf9..11d99d53e 100644 --- a/doc/source/serve/key-concepts.rst +++ b/doc/source/serve/key-concepts.rst @@ -11,6 +11,7 @@ To follow along, you'll need to make the necessary imports. from ray import serve serve.init() # Initializes Ray and Ray Serve. +.. 
_`serve-backend`: Backends ======== diff --git a/python/ray/serve/examples/doc/snippet_model_composition.py b/python/ray/serve/examples/doc/snippet_model_composition.py index 29bef42c7..1cf3f2c4c 100644 --- a/python/ray/serve/examples/doc/snippet_model_composition.py +++ b/python/ray/serve/examples/doc/snippet_model_composition.py @@ -1,11 +1,17 @@ from random import random - import requests - from ray import serve serve.init() +# Our pipeline will be structured as follows: +# - Input comes in, and the composed model sends it to model_one. +# - model_one outputs a random number between 0 and 1; if the value is +# greater than 0.5, then the data is sent to model_two. +# - Otherwise, the data is returned to the user. + +# Let's define two models that just print out the data they received. + def model_one(_unused_flask_request, data=None): print("Model 1 called with data ", data) @@ -22,6 +28,7 @@ class ComposedModel: self.model_one = serve.get_handle("model_one") self.model_two = serve.get_handle("model_two") + # This method can be called concurrently! async def __call__(self, flask_request): data = flask_request.data @@ -41,6 +48,8 @@ serve.create_endpoint("model_one", backend="model_one") serve.create_backend("model_two", model_two) serve.create_endpoint("model_two", backend="model_two") +# max_concurrent_queries is optional. By default, if you pass in an async +# function, Ray Serve sets the limit to a high number. serve.create_backend( "composed_backend", ComposedModel, config={"max_concurrent_queries": 10}) serve.create_endpoint(