From fadd47e44eff41812c07af78fc7f5f0703c678e4 Mon Sep 17 00:00:00 2001 From: Bill Chambers Date: Wed, 27 May 2020 09:03:28 -0700 Subject: [PATCH] [docs] Ray Serve Documentation Overhaul (#8524) --- doc/source/index.rst | 9 +- doc/source/serve/advanced.rst | 149 +++++++++ doc/source/serve/deployment.rst | 171 ++++++++++ doc/source/serve/index.rst | 93 ++++++ doc/source/serve/key-concepts.rst | 84 +++++ doc/source/serve/overview.rst | 306 ------------------ doc/source/serve/tutorials/index.rst | 19 ++ .../{pytorch-tutorial.rst => pytorch.rst} | 4 +- .../{sklearn-tutorial.rst => sklearn.rst} | 4 +- ...tensorflow-tutorial.rst => tensorflow.rst} | 4 +- python/ray/serve/BUILD | 9 + .../ray/serve/examples/doc/tutorial_deploy.py | 169 ++++++++++ 12 files changed, 705 insertions(+), 316 deletions(-) create mode 100644 doc/source/serve/advanced.rst create mode 100644 doc/source/serve/deployment.rst create mode 100644 doc/source/serve/index.rst create mode 100644 doc/source/serve/key-concepts.rst delete mode 100644 doc/source/serve/overview.rst create mode 100644 doc/source/serve/tutorials/index.rst rename doc/source/serve/tutorials/{pytorch-tutorial.rst => pytorch.rst} (89%) rename doc/source/serve/tutorials/{sklearn-tutorial.rst => sklearn.rst} (89%) rename doc/source/serve/tutorials/{tensorflow-tutorial.rst => tensorflow.rst} (90%) create mode 100644 python/ray/serve/examples/doc/tutorial_deploy.py diff --git a/doc/source/index.rst b/doc/source/index.rst index a3aefa20f..0789c7ddc 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -187,10 +187,11 @@ Getting Involved :maxdepth: -1 :caption: Ray Serve - serve/overview.rst - serve/tutorials/tensorflow-tutorial.rst - serve/tutorials/pytorch-tutorial.rst - serve/tutorials/sklearn-tutorial.rst + serve/index.rst + serve/key-concepts.rst + serve/tutorials/index.rst + serve/deployment.rst + serve/advanced.rst .. 
toctree:: :maxdepth: -1 diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst new file mode 100644 index 000000000..2081e2df6 --- /dev/null +++ b/doc/source/serve/advanced.rst @@ -0,0 +1,149 @@ +====================================== +Advanced Topics, Configurations, & FAQ +====================================== + +Ray Serve has a number of knobs and tools for you to tune for your particular workload. +All Ray Serve advanced options and topics are covered on this page aside from the +fundamentals of :doc:`deployment`. For a more hands on take, please check out the :ref:`serve-tutorials`. + +There are a number of things you'll likely want to do with your serving application including +scaling out, splitting traffic, or batching input for better performance. To do all of this, +you will create a ``BackendConfig``, a configuration object that you'll use to set +the properties of a particular backend. + +.. contents:: + +Scaling Out +=========== + +To scale out a backend to multiple workers, simplify configure the number of replicas. + +.. code-block:: python + + config = {"num_replicas": 10} + serve.create_backend("my_scaled_endpoint_backend", handle_request, config=config) + + # scale it back down... + config = {"num_replicas": 2} + serve.set_backend_config("my_scaled_endpoint_backend", handle_request, config=config) + +This will scale up or down the number of workers that can accept requests. + +Using Resources (CPUs, GPUs) +============================ + +To assign hardware resource per worker, you can pass resource requirements to +``ray_actor_options``. To learn about options to pass in, take a look at +:ref:`Resources with Actor` guide. + +For example, to create a backend where each replica uses a single GPU, you can do the +following: + +.. code-block:: python + + config = {"num_gpus": 1} + serve.create_backend("my_gpu_backend", handle_request, ray_actor_options=config) + +.. 
note:: + + Deep learning models like PyTorch and Tensorflow often use all the CPUs when + performing inference. Ray sets the environment variable ``OMP_NUM_THREADS=1`` to + :ref:`avoid contention`. This means each worker will only + use one CPU instead of all of them. + +.. _serve-batching: + +Batching to improve performance +=============================== + +You can also have Ray Serve batch requests for performance. In order to use this feature, you need to: +1. Set the `max_batch_size` in the `BackendConfig`. +2. Modify your backend implementation to accept a list of requests and return a list of responses instead of handling a single request. + + +.. code-block:: python + + class BatchingExample: + def __init__(self): + self.count = 0 + + @serve.accept_batch + def __call__(self, requests): + responses = [] + for request in requests: + responses.append(request.json()) + return responses + + serve.create_endpoint("counter1", "/increment") + + config = {"max_batch_size": 5} + serve.create_backend("counter1", BatchingExample, config=config) + serve.set_traffic("counter1", {"counter1": 1.0}) + +.. _`serve-split-traffic`: + +Splitting Traffic and A/B Testing +================================== + +It's also trivial to split traffic: simply specify the endpoint and the backends that you want to split. + +.. code-block:: python + + serve.create_endpoint("endpoint_identifier_split", "/split", methods=["GET", "POST"]) + + # splitting traffic 70/30 + serve.set_traffic("endpoint_identifier_split", {"my_endpoint_backend": 0.7, "my_endpoint_backend_class": 0.3}) + +While splitting traffic is generally simple, at times you'll want to consider :ref:`session-affinity`, making it easy to +control which users see which version of the model. See the docs on :ref:`session-affinity` for more information. + +.. _session-affinity: + +Session Affinity +================ + +In some cases, you may want to ensure that requests from the same client, user, etc. 
get mapped to the same backend. +To do this, you can specify a "shard key" that will deterministically map requests to a backend. +The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or ``handle.options(shard_key="key")``. + +.. note:: The mapping from shard key to backend may change when you update the traffic policy for an endpoint. + +.. code-block:: python + + # Specifying the shard key via an HTTP header. + requests.get("http://127.0.0.1:8000/api", headers={"X-SERVE-SHARD-KEY": session_id}) + + # Specifying the shard key in a call made via serve handle. + handle = serve.get_handle("api_endpoint") + handle.options(shard_key=session_id).remote(args) + + +.. _serve-faq: + +Ray Serve FAQ +============= + +How do I deploy serve? +---------------------- + +See :doc:`deployment` for information about how to deploy serve. + +How do I delete backends and endpoints? +--------------------------------------- + +To delete a backend, you can use `serve.delete_backend`. +Note that the backend must not be in use by any endpoints in order to be deleted. +Once a backend is deleted, its tag can be reused. + +.. code-block:: python + + serve.delete_backend("simple_backend") + + +To delete an endpoint, you can use `serve.delete_endpoint`. +Note that the endpoint will no longer work and will return a 404 when queried. +Once an endpoint is deleted, its tag can be reused. + +.. code-block:: python + + serve.delete_endpoint("simple_endpoint") diff --git a/doc/source/serve/deployment.rst b/doc/source/serve/deployment.rst new file mode 100644 index 000000000..683af72ed --- /dev/null +++ b/doc/source/serve/deployment.rst @@ -0,0 +1,171 @@ +=================== +Deploying Ray Serve +=================== + +In the :doc:`key-concepts`, you saw some of the basics of how to write serve applications. +This section will dive a bit deeper into how Ray Serve runs on a Ray cluster and how you're able +to deploy and update your serve application over time. 
+ +To deploy a Ray Serve application (and cluster) you're going to need several things. + +1. A running Ray cluster (you can deploy one on your local machine for testing). +2. A Ray Serve cluster To learn more about Ray clusters see :doc:`../cluster-index`. +3. Your Ray Serve endpoint(s) and backend(s). + +.. contents:: Deploying Ray Serve + +.. _serve-deploy-tutorial: + +Deploying a Model with Ray Serve +================================ + +Let's get started deploying our first Ray Serve application. The first thing you'll need +to do is start a Ray cluster. You can do that using the Ray autoscaler, but in our case +we'll create it on our local machine. To learn more about Ray Clusters see :doc:`../cluster-index`. + +Starting the Cluster +-------------------- +We do that by running: + +.. code:: + + ray start --head + +That starts a cluster on our local machine. We can shut that down by running ``ray stop``. You should +run this after we complete this tutorial. + +Setup: Training a Model +----------------------- + +Make sure you install `Scikit-learn `_. + +Place the following in a python script and run it. In this example we're training +a model and saving it to disk for us to load into our Ray Serve app. + +.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py + :start-after: __doc_import_train_begin__ + :end-before: __doc_import_train_end__ + +As discussed in other :doc:`tutorials/index`, we can use any framework to build these models. In general, +you'll just want to have the ability to persist these models to disk. + +Now that we've trained that model and saved it to disk (keep in mind this could also be a service like S3), +we'll need to create a backend to serve the model. + +Creating a Model and Serving it +------------------------------- + +In the following snippet we will complete two things: +1. Define a servable model by instantiating a class and defining the ``__call__`` method. +2. 
Connect to our running Ray cluster(``ray.init(...)``) and then start or connect to the Ray Serve service +on that cluster(``serve.init(...)``). + + +You can see that defining the model is straightforward and simple, we're simply instantiating +the model like we might a typical Python class. + +Configuring our model to accept traffic is specified via ``.set_traffic`` after we created +a backend in serve for our model (and versioned it with a string). + +.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py + :start-after: __doc_create_deploy_begin__ + :end-before: __doc_create_deploy_end__ + +What serve does when we run this code is store the model as a Ray actor +and route traffic to it as the endpoint is queried, in this case over HTTP. + +Let's now query our endpoint to see the result. + +Querying our Endpoint +--------------------- + +We'll use the requests library to query our endpoint and be able to get a result. + +.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py + :start-after: __doc_query_begin__ + :end-before: __doc_query_end__ + + +Now that we defined a model and have it running on our Ray cluster. Let's proceed with updating +this model with a new set of code. + +Updating Your Model Over Time +============================= + +Updating our model is as simple as deploying the first one. While the code snippet includes +a lot of information, all that we're doing is we are defining a new model, saving it, then loading +it into serve. The key lines are at the end. + +.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py + :start-after: __doc_create_deploy_2_begin__ + :end-before: __doc_create_deploy_2_end__ + +Consequentially, since Ray Serve runs as a service, all we need to tell it is that (a) there's a new model +and (b) how much traffic we should send to that model (and from what endpoint). 
+ +We do that with the line at the end of the code snippet, which allows us to split traffic between +these two models. + +.. code:: + + serve.set_traffic("iris_classifier", {"lr:v2": 0.25, "lr:v1": 0.75}) + +While this is a simple operation, you may want to see :ref:`serve-split-traffic` for more information. +One thing you may want to consider as well is +:ref:`session-affinity` which gives you the ability to ensure that queries from users/clients always get mapped to the same backend. +versions. + +Now that we're up and running serving two models in production, let's query +our results several times to see some results. You'll notice that we're now splitting +traffic between these two different models. + +Querying our Endpoint +--------------------- + +We'll use the requests library to query our endpoint and be able to get a result. + +.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py + :start-after: __doc_query_begin__ + :end-before: __doc_query_end__ + +If you run this code several times, you'll notice that the output will change - this +is due to us running the two models in parallel that we created above. + +Upon concluding the above tutorial, you'll want to run ``ray stop`` to +shutdown the Ray cluster on your local machine. + +Deployment FAQ +============== + +Best practices for local development +------------------------------------ + +One thing you may notice is that we never have to declare a ``while True`` loop or +something to keep the Ray Serve process running. In general, we don't recommend using forever loops and therefore +opt for launching a Ray Cluster locally. Specify a Ray cluster like we did in :ref:`serve-deploy-tutorial`. +To learn more, in general, about Ray Clusters see :doc:`../cluster-index`. 
+ + +Deploying Multiple Serve Clusters on a Single Ray Cluster +--------------------------------------------------------- + +You can run multiple serve clusters on the same Ray cluster by providing a ``cluster_name`` to ``serve.init()``. + +.. code-block:: python + + # Create a first cluster whose HTTP server listens on 8000. + serve.init(cluster_name="cluster1", http_port=8000) + serve.create_endpoint("counter1", "/increment") + + # Create a second cluster whose HTTP server listens on 8001. + serve.init(cluster_name="cluster2", http_port=8001) + serve.create_endpoint("counter1", "/increment") + + # Create a backend that will be served on the second cluster. + serve.create_backend("counter1", function) + serve.set_traffic("counter1", {"counter1": 1.0}) + + # Switch back the the first cluster and create the same backend on it. + serve.init(cluster_name="cluster1") + serve.create_backend("counter1", function) + serve.set_traffic("counter1", {"counter1": 1.0}) diff --git a/doc/source/serve/index.rst b/doc/source/serve/index.rst new file mode 100644 index 000000000..17d470c62 --- /dev/null +++ b/doc/source/serve/index.rst @@ -0,0 +1,93 @@ +.. _rayserve: + +============================================ +Ray Serve: Scalable and Programmable Serving +============================================ + +.. image:: logo.svg + :align: center + :height: 250px + :width: 400px + +.. _rayserve-overview: + +Ray Serve is a scalable model-serving library built on Ray. + +For users, Ray Serve is: + +- **Framework Agnostic**:Use the same toolkit to serve everything from deep learning models + built with frameworks like :ref:`PyTorch ` or + :ref:`Tensorflow & Keras ` to :ref:`Scikit-Learn ` models or arbitrary business logic. +- **Python First**: Configure your model serving with pure Python code - no more YAMLs or + JSON configs. + +As a library, Ray Serve enables: + +- :ref:`serve-split-traffic` with zero downtime by decoupling routing logic from response handling logic. 
+- :ref:`serve-batching` built-in to help you meet your performance objectives or use your model for batch and online processing. + +Since Ray Serve is built on Ray, Ray Serve also allows you to **scale to many machines** +and allows you to leverage all of the other Ray frameworks so you can deploy and scale on any cloud. + +.. note:: + If you want to try out Serve, join our `community slack `_ + and discuss in the #serve channel. + + +Installation +============ + +Ray Serve supports Python versions 3.5 and higher. To install Ray Serve: + +.. code-block:: bash + + pip install "ray[serve]" + +Ray Serve in 90 Seconds +======================= + +Serve a function by defining a function, an endpoint, and a backend (in this case a stateless function) then +connecting the two by setting traffic from the endpoint to the backend. + +.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_function.py + +Serve a stateful class by defining a class (``Counter``), creating an endpoint and a backend, then connecting +the two by setting traffic from the endpoint to the backend. + +.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_class.py + +See :doc:`key-concepts` for more exhaustive coverage about Ray Serve and its core concepts. + +Why Ray Serve? +============== + +There are generally two ways of serving machine learning applications, both with serious limitations: +you can build using a **traditional webserver** - your own Flask app or you can use a cloud hosted solution. + +The first approach is easy to get started with, but it's hard to scale each component. The second approach +requires vendor lock-in (SageMaker), framework specific tooling (TFServing), and a general +lack of flexibility. + +Ray Serve solves these problems by giving a user the ability to leverage the simplicity +of deployment of a simple webserver but handles the complex routing, scaling, and testing logic +necessary for production deployments. 
+ +For more on the motivation behind Ray Serve, check out these `meetup slides `_. + +When should I use Ray Serve? +---------------------------- + +Ray Serve is a simple (but flexible) tool for deploying, operating, and monitoring Python based machine learning models. +Ray Serve excels when scaling out to serve models in production is a necessity. This might be because of large scale batch processing +requirements or because you're going to serve a number of models behind different endpoints and may need to run A/B tests or control +traffic between different models. + +If you plan on running on multiple machines, Ray Serve will serve you well. + +What's next? +============ + +Check out the :doc:`key-concepts`, learn more about :doc:`advanced`, look at the :ref:`serve-faq`, +or head over to the :doc:`tutorials/index` to get started building your Ray Serve Applications. + + diff --git a/doc/source/serve/key-concepts.rst b/doc/source/serve/key-concepts.rst new file mode 100644 index 000000000..e8a858238 --- /dev/null +++ b/doc/source/serve/key-concepts.rst @@ -0,0 +1,84 @@ +============ +Key Concepts +============ + +Ray Serve focuses on **simplicity** and only has two core concepts: endpoints and backends. + +To follow along, you'll need to make the necessary imports. + +.. code-block:: python + + from ray import serve + serve.init() # Initializes Ray and Ray Serve. + +.. _serve-endpoint: + +Endpoints +========= + +Endpoints allow you to name the "entity" that you'll be exposing, +the HTTP path that your application will expose. +Endpoints are "logical" and decoupled from the business logic or +model that you'll be serving. To create one, we'll simply specify the name, route, and methods. + +.. code-block:: python + + serve.create_endpoint("simple_endpoint", "/simple") + +You can also delete an endpoint using `serve.delete_endpoint`. +Note that this will not delete any associated backends, which can be reused for other endpoints. + +.. 
code-block:: python + + serve.delete_endpoint("simple_endpoint") + +.. _serve-backend: + +Backends +======== + +Backends are the logical structures for your business logic or models and +how you specify what should happen when an endpoint is queried. +To define a backend, first you must define the "handler" or the business logic you'd like to respond with. +The input to this handler will be a `Flask Request object `_. +Use a function when your response is stateless and a class when you +might need to maintain some state (like a model). +For both functions and classes (that take as input Flask Requests), you'll need to +define them as backends to Ray Serve. + +It's important to note that Ray Serve places these backends in individual worker processes, which are replicas of the model. + +.. code-block:: python + + def handle_request(flask_request): + return "hello world" + + class RequestHandler: + def __init__(self): + self.msg = "hello, world!" + + def __call__(self, flask_request): + return self.msg + + serve.create_backend("simple_backend", handle_request) + serve.create_backend("simple_backend_class", RequestHandler) + +Setting Traffic +=============== + +Lastly, we need to route traffic from the server endpoint to the particular backend. +To do that we'll use the ``set_traffic`` capability. +A traffic policy is essentially a load-balancer and allows you to define queuing policies +for how you would like backends to be served via an endpoint. +For instance, you can route 50% of traffic to Model A and 50% of traffic to Model B. + +.. code-block:: python + + serve.set_traffic("simple_endpoint", {"simple_backend": 1.0}) + +Once we've done that, we can now query our endpoint via HTTP (we use `requests` to make HTTP calls here). + +.. 
code-block:: python + + import requests + print(requests.get("http://127.0.0.1:8000/-/routes", timeout=0.5).text) diff --git a/doc/source/serve/overview.rst b/doc/source/serve/overview.rst deleted file mode 100644 index 62998ed5a..000000000 --- a/doc/source/serve/overview.rst +++ /dev/null @@ -1,306 +0,0 @@ -.. _rayserve: - -Ray Serve: Scalable and Programmable Serving -============================================ - -.. image:: logo.svg - :align: center - :height: 250px - :width: 400px - -.. _rayserve-overview: - -Overview --------- - -Ray Serve is a scalable model-serving library built on Ray. - -For users Ray Serve is: - -- **Framework Agnostic**:Use the same toolkit to serve everything from deep learning models - built with frameworks like PyTorch or TensorFlow to scikit-learn models or arbitrary business logic. -- **Python First**: Configure your model serving with pure Python code - no more YAMLs or - JSON configs. - -Ray Serve enables: - -- **A/B test models** with zero downtime by decoupling routing logic from response handling logic. -- **Batching** built-in to help you meet your performance objectives. - -Since Ray is built on Ray, Ray Serve also allows you to **scale to many machines** -and allows you to leverage all of the other Ray frameworks so you can deploy and scale on any cloud. - -.. note:: - If you want to try out Serve, join our `community slack `_ - and discuss in the #serve channel. - - -Installation -~~~~~~~~~~~~ -Ray Serve supports Python versions 3.5 and higher. To install Ray Serve: - -.. code-block:: bash - - pip install "ray[serve]" - - - -Ray Serve in 90 Seconds -~~~~~~~~~~~~~~~~~~~~~~~ - -Serve a stateless function: - -.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_function.py - -Serve a stateful class: - -.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_class.py - -See :ref:`serve-key-concepts` for more information about working with Ray Serve. - -Why Ray Serve? 
-~~~~~~~~~~~~~~ - -There are generally two ways of serving machine learning applications, both with serious limitations: -you can build using a **traditional webserver** - your own Flask app or you can use a cloud hosted solution. - -The first approach is easy to get started with, but it's hard to scale each component. The second approach -requires vendor lock-in (SageMaker), framework specific tooling (TFServing), and a general -lack of flexibility. - -Ray Serve solves these problems by giving a user the ability to leverage the simplicity -of deployment of a simple webserver but handles the complex routing, scaling, and testing logic -necessary for production deployments. - -For more on the motivation behind Ray Serve, check out these `meetup slides `_. - -When should I use Ray Serve? -++++++++++++++++++++++++++++ - -Ray Serve should be used when you need to deploy at least one model, preferrably many models. -Ray Serve **won't work well** when you need to run batch prediction over a dataset. Given this use case, we recommend looking into `multiprocessing with Ray `_. - -.. _serve-key-concepts: - -Key Concepts ------------- - -Ray Serve focuses on **simplicity** and only has two core concepts: endpoints and backends. - -To follow along, you'll need to make the necessary imports. - -.. code-block:: python - - from ray import serve - serve.init() # initializes serve and Ray - -.. _serve-endpoint: - -Endpoints -~~~~~~~~~ - -Endpoints allow you to name the "entity" that you'll be exposing, -the HTTP path that your application will expose. -Endpoints are "logical" and decoupled from the business logic or -model that you'll be serving. To create one, we'll simply specify the name, route, and methods. - -.. code-block:: python - - serve.create_endpoint("simple_endpoint", "/simple") - -You can also delete an endpoint using `serve.delete_endpoint`. -Note that this will not delete any associated backends, which can be reused for other endpoints. - -.. 
code-block:: python - - serve.delete_endpoint("simple_endpoint") - -.. _serve-backend: - -Backends -~~~~~~~~ - -Backends are the logical structures for your business logic or models and -how you specify what should happen when an endpoint is queried. -To define a backend, first you must define the "handler" or the business logic you'd like to respond with. -The input to this request will be a `Flask Request object `_. -Once you define the function (or class) that will handle a request. -You'd use a function when your response is stateless and a class when you -might need to maintain some state (like a model). -For both functions and classes (that take as input Flask Requests), you'll need to -define them as backends to Ray Serve. - -It's important to note that Ray Serve places these backends in individual workers, which are replicas of the model. - -.. code-block:: python - - def handle_request(flask_request): - return "hello world" - - class RequestHandler: - def __init__(self): - self.msg = "hello, world!" - - def __call__(self, flask_request): - return self.msg - - serve.create_backend("simple_backend", handle_request) - serve.create_backend("simple_backend_class", RequestHandler) - -Lastly, we need to link the particular backend to the server endpoint. -To do that we'll use the ``link`` capability. -A link is essentially a load-balancer and allow you to define queuing policies -for how you would like backends to be served via an endpoint. -For instance, you can route 50% of traffic to Model A and 50% of traffic to Model B. - -.. code-block:: python - - serve.set_traffic("simple_backend", {"simple_endpoint": 1.0}) - -Once we've done that, we can now query our endpoint via HTTP (we use `requests` to make HTTP calls here). - -.. code-block:: python - - import requests - print(requests.get("http://127.0.0.1:8000/-/routes", timeout=0.5).text) - -To delete a backend, we can use `serve.delete_backend`. 
-Note that the backend must not be use by any endpoints in order to be delete. -Once a backend is deleted, its tag can be reused. - -.. code-block:: python - - serve.delete_backend("simple_backend") - -Configuring Backends -~~~~~~~~~~~~~~~~~~~~ - -There are a number of things you'll likely want to do with your serving application including -scaling out, splitting traffic, or batching input for better response performance. To do all of this, -you will create a ``BackendConfig``, a configuration object that you'll use to set -the properties of a particular backend. - -Scaling Out -+++++++++++ - -To scale out a backend to multiple workers, simplify configure the number of replicas. - -.. code-block:: python - - config = {"num_replicas": 2} - serve.create_backend("my_scaled_endpoint_backend", handle_request, config=config) - -This will scale out the number of workers that can accept requests. - -Using Resources (CPUs, GPUs) -++++++++++++++++++++++++++++ -To assign hardware resource per worker, you can pass resource requirements to -``ray_actor_options``. To learn about options to pass in, take a look at -:ref:`Resources with Actor` guide. - -For example, to create a backend where each replica uses a single GPU, you can do the -following: - -.. code-block:: python - - options = {"num_gpus": 1} - serve.create_backend("my_gpu_backend", handle_request, ray_actor_options=options) - -.. note:: - - Deep learning models like PyTorch and Tensorflow often use all the CPUs when - performing inference. Ray sets the environment variable ``OMP_NUM_THREADS=1`` to - :ref:`avoid contention`. This means each worker will only - use one CPU instead of all of them. - -Splitting Traffic -+++++++++++++++++ - -It's trivial to also split traffic, simply specify the endpoint and the backends that you want to split. - -.. 
code-block:: python - - serve.create_endpoint("endpoint_identifier_split", "/split", methods=["GET", "POST"]) - - # splitting traffic 70/30 - serve.set_traffic("endpoint_identifier_split", {"my_endpoint_backend": 0.7, "my_endpoint_backend_class": 0.3}) - - -Batching -++++++++ - -You can also have Ray Serve batch requests for performance. You'll configure this in the backend config. - -.. code-block:: python - - class BatchingExample: - def __init__(self): - self.count = 0 - - @serve.accept_batch - def __call__(self, flask_request): - self.count += 1 - batch_size = serve.context.batch_size - return [self.count] * batch_size - - serve.create_endpoint("counter1", "/increment") - - config = {"max_batch_size": 5} - serve.create_backend("counter1", BatchingExample, config=config) - serve.set_traffic("counter1", {"counter1": 1.0}) - -Session Affinity -++++++++++++++++ - -In some cases, you may want to ensure that requests from the same client, user, etc. get mapped to the same backend. -To do this, you can specify a "shard key" that will deterministically map requests to a backend. -The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or ``handle.options(shard_key="key")``. - -.. note:: The mapping from shard key to backend may change when you update the traffic policy for an endpoint. - -.. code-block:: python - - # Specifying the shard key via an HTTP header. - requests.get("127.0.0.1:8000/api", headers={"X-SERVE-SHARD-KEY": session_id}) - - # Specifying the shard key in a call made via serve handle. - handle = serve.get_handle("api_endpoint") - handler.options(shard_key=session_id).remote(args) - -Running Multiple Serve Clusters on one Ray Cluster -++++++++++++++++++++++++++++++++++++++++++++++++++ - -You can run multiple serve clusters on the same Ray cluster by providing a ``cluster_name`` to ``serve.init()``. - -.. code-block:: python - - # Create a first cluster whose HTTP server listens on 8000. 
- serve.init(cluster_name="cluster1", http_port=8000) - serve.create_endpoint("counter1", "/increment") - - # Create a second cluster whose HTTP server listens on 8001. - serve.init(cluster_name="cluster2", http_port=8001) - serve.create_endpoint("counter1", "/increment") - - # Create a backend that will be served on the second cluster. - serve.create_backend("counter1", function) - serve.set_traffic("counter1", {"counter1": 1.0}) - - # Switch back the the first cluster and create the same backend on it. - serve.init(cluster_name="cluster1") - serve.create_backend("counter1", function) - serve.set_traffic("counter1", {"counter1": 1.0}) - -Other Resources ---------------- - -.. _serve_frameworks: - -Frameworks -~~~~~~~~~~ -Ray Serve makes it easy to deploy models from all popular frameworks. -Learn more about how to deploy your model in the following tutorials: - -- :ref:`Tensorflow & Keras ` -- :ref:`PyTorch ` -- :ref:`Scikit-Learn ` diff --git a/doc/source/serve/tutorials/index.rst b/doc/source/serve/tutorials/index.rst new file mode 100644 index 000000000..20d62dbee --- /dev/null +++ b/doc/source/serve/tutorials/index.rst @@ -0,0 +1,19 @@ +========= +Tutorials +========= + +Below are a list of tutorials that you can use to learn more about the different pieces of +Ray Serve functionality and how to integrate different modeling frameworks. + +.. 
toctree:: + :caption: Serve Tutorials + :name: serve-tutorials + :maxdepth: -1 + + tensorflow.rst + pytorch.rst + sklearn.rst + + +Other Topics: +- :doc:`../deployment` \ No newline at end of file diff --git a/doc/source/serve/tutorials/pytorch-tutorial.rst b/doc/source/serve/tutorials/pytorch.rst similarity index 89% rename from doc/source/serve/tutorials/pytorch-tutorial.rst rename to doc/source/serve/tutorials/pytorch.rst index fb2d2cd2c..214637557 100644 --- a/doc/source/serve/tutorials/pytorch-tutorial.rst +++ b/doc/source/serve/tutorials/pytorch.rst @@ -9,10 +9,10 @@ In particular, we show: - How to load the model from PyTorch's pre-trained modelzoo. - How to parse the JSON request, transform the payload and evaluated in the model. -Please see the :ref:`overview ` to learn more general information about Ray Serve. +Please see the :doc:`../key-concepts` to learn more general information about Ray Serve. This tutorial requires Pytorch and Torchvision installed in your system. Ray Serve -is :ref:`framework agnostic ` and work with any version of PyTorch. +is framework agnostic and works with any version of PyTorch. .. code-block:: bash diff --git a/doc/source/serve/tutorials/sklearn-tutorial.rst b/doc/source/serve/tutorials/sklearn.rst similarity index 89% rename from doc/source/serve/tutorials/sklearn-tutorial.rst rename to doc/source/serve/tutorials/sklearn.rst index 3e0d5f745..752a49d34 100644 --- a/doc/source/serve/tutorials/sklearn-tutorial.rst +++ b/doc/source/serve/tutorials/sklearn.rst @@ -9,9 +9,9 @@ In particular, we show: - How to load the model from file system in your Ray Serve definition - How to parse the JSON request and evaluated in sklearn model -Please see the :ref:`overview ` to learn more general information about Ray Serve. +Please see the :doc:`../key-concepts` to learn more general information about Ray Serve. -Ray Serve supports :ref:`arbitrary frameworks `. You can use any version of sklearn. +Ray Serve is framework agnostic. 
You can use any version of sklearn. .. code-block:: bash diff --git a/doc/source/serve/tutorials/tensorflow-tutorial.rst b/doc/source/serve/tutorials/tensorflow.rst similarity index 90% rename from doc/source/serve/tutorials/tensorflow-tutorial.rst rename to doc/source/serve/tutorials/tensorflow.rst index e63b9a1f4..73bc577dc 100644 --- a/doc/source/serve/tutorials/tensorflow-tutorial.rst +++ b/doc/source/serve/tutorials/tensorflow.rst @@ -9,9 +9,9 @@ In particular, we show: - How to load the model from file system in your Ray Serve definition - How to parse the JSON request and evaluated in Tensorflow -Please see the :ref:`overview ` to learn more general information about Ray Serve. +Please see the :doc:`../key-concepts` to learn more general information about Ray Serve. -Ray Serve makes it easy to deploy models from :ref:`all popular frameworks `. +Ray Serve is framework agnostic, so you can use any version of Tensorflow. However, for this tutorial, we use Tensorflow 2 and Keras. Please make sure you have Tensorflow 2 installed. 
diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD
index 124a6a7b4..e680e3561 100644
--- a/python/ray/serve/BUILD
+++ b/python/ray/serve/BUILD
@@ -88,3 +88,12 @@ py_test(
     tags = ["exclusive"],
     deps = [":serve_lib"]
 )
+
+py_test(
+    name = "tutorial_deploy",
+    size = "small",
+    srcs = glob(["examples/doc/*.py"]),
+    tags = ["exclusive"],
+    deps = [":serve_lib"]
+)
+
diff --git a/python/ray/serve/examples/doc/tutorial_deploy.py b/python/ray/serve/examples/doc/tutorial_deploy.py
new file mode 100644
index 000000000..d074ac597
--- /dev/null
+++ b/python/ray/serve/examples/doc/tutorial_deploy.py
@@ -0,0 +1,169 @@
+# yapf: disable
+# __doc_import_train_begin__
+import pickle
+import json
+import numpy as np
+
+from sklearn.datasets import load_iris
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.metrics import mean_squared_error
+
+# Load data
+iris_dataset = load_iris()
+data, target, target_names = iris_dataset["data"], iris_dataset[
+    "target"], iris_dataset["target_names"]
+
+# Instantiate model
+model = GradientBoostingClassifier()
+
+# Training and validation split (one permutation keeps rows and labels paired)
+order = np.random.permutation(len(data))
+train_x, train_y = data[order[:100]], target[order[:100]]
+val_x, val_y = data[order[100:]], target[order[100:]]
+
+# Train and evaluate models
+model.fit(train_x, train_y)
+print("MSE:", mean_squared_error(model.predict(val_x), val_y))
+
+# Save the model and label to file
+with open("/tmp/iris_model_logistic_regression.pkl", "wb") as f:
+    pickle.dump(model, f)
+with open("/tmp/iris_labels.json", "w") as f:
+    json.dump(target_names.tolist(), f)
+# __doc_import_train_end__
+
+
+# __doc_create_deploy_begin__
+import pickle  # noqa: E402
+import json  # noqa: E402
+
+from ray import serve  # noqa: E402
+import ray  # noqa: E402
+
+
+class BoostingModel:
+    def __init__(self):
+        with open("/tmp/iris_model_logistic_regression.pkl", "rb") as f:
+            self.model = pickle.load(f)
+        with open("/tmp/iris_labels.json") as f:
+            self.label_list = json.load(f)
+
+    def __call__(self, flask_request):
+        payload = flask_request.json
+        print("Worker: received flask request with data", payload)
+
+        input_vector = [
+            payload["sepal length"],
+            payload["sepal width"],
+            payload["petal length"],
+            payload["petal width"],
+        ]
+        prediction = self.model.predict([input_vector])[0]
+        human_name = self.label_list[prediction]
+        return {"result": human_name, "version": "v1"}
+
+
+# connect to our existing Ray cluster
+# (address="auto" attaches to an already-running cluster)
+ray.init(address="auto")
+# now we initialize / connect to Ray Serve
+
+serve.init()
+serve.create_endpoint("iris_classifier", "/regressor")
+serve.create_backend("lr:v1", BoostingModel)
+serve.set_traffic("iris_classifier", {"lr:v1": 1})
+# __doc_create_deploy_end__
+
+# __doc_query_begin__
+import requests  # noqa: E402
+
+sample_request_input = {
+    "sepal length": 1.2,
+    "sepal width": 1.0,
+    "petal length": 1.1,
+    "petal width": 0.9,
+}
+response = requests.get(
+    "http://localhost:8000/regressor", json=sample_request_input)
+print(response.text)
+# Result:
+# {
+#  "result": "setosa",
+#  "version": "v1"
+# }
+# this result may vary, since the training parameters may change.
+# as we update this model, this result will also change over time.
+# __doc_query_end__
+
+
+# __doc_create_deploy_2_begin__
+import pickle  # noqa: E402
+import json  # noqa: E402
+import numpy as np  # noqa: E402
+
+from sklearn.datasets import load_iris  # noqa: E402
+from sklearn.ensemble import GradientBoostingClassifier  # noqa: E402
+from sklearn.metrics import mean_squared_error  # noqa: E402
+
+# Load data
+iris_dataset = load_iris()
+data, target, target_names = iris_dataset["data"], iris_dataset[
+    "target"], iris_dataset["target_names"]
+
+# Instantiate model
+model = GradientBoostingClassifier()
+
+# Training and validation split (one permutation keeps rows and labels paired)
+order = np.random.permutation(len(data))
+train_x, train_y = data[order[:100]], target[order[:100]]
+val_x, val_y = data[order[100:]], target[order[100:]]
+
+# Train and evaluate models
+model.fit(train_x, train_y)
+print("MSE:", mean_squared_error(model.predict(val_x), val_y))
+
+# Save the model and label to file
+with open("/tmp/iris_model_logistic_regression_2.pkl", "wb") as f:
+    pickle.dump(model, f)
+with open("/tmp/iris_labels_2.json", "w") as f:
+    json.dump(target_names.tolist(), f)
+
+
+import pickle  # noqa: E402
+import json  # noqa: E402
+
+from ray import serve  # noqa: E402
+import ray  # noqa: E402
+
+
+class BoostingModelv2:
+    def __init__(self):
+        with open("/tmp/iris_model_logistic_regression_2.pkl", "rb") as f:
+            self.model = pickle.load(f)
+        with open("/tmp/iris_labels_2.json") as f:
+            self.label_list = json.load(f)
+
+    def __call__(self, flask_request):
+        payload = flask_request.json
+        print("Worker: received flask request with data", payload)
+
+        input_vector = [
+            payload["sepal length"],
+            payload["sepal width"],
+            payload["petal length"],
+            payload["petal width"],
+        ]
+        prediction = self.model.predict([input_vector])[0]
+        human_name = self.label_list[prediction]
+        return {"result": human_name, "version": "v2"}
+
+# connect to our existing Ray cluster
+# note that the password will be different for your redis instance!
+# ray.init(address='auto', redis_password='5241590000000000')
+# now we initialize / connect to Ray Serve
+
+
+serve.init()
+serve.create_backend("lr:v2", BoostingModelv2)
+serve.set_traffic("iris_classifier", {"lr:v2": 0.25, "lr:v1": 0.75})
+# __doc_create_deploy_2_end__