From fadd47e44eff41812c07af78fc7f5f0703c678e4 Mon Sep 17 00:00:00 2001
From: Bill Chambers <bill@anyscale.com>
Date: Wed, 27 May 2020 09:03:28 -0700
Subject: [PATCH] [docs] Ray Serve Documentation Overhaul (#8524)

---
 doc/source/index.rst                          |   9 +-
 doc/source/serve/advanced.rst                 | 149 +++++++++
 doc/source/serve/deployment.rst               | 171 ++++++++++
 doc/source/serve/index.rst                    |  93 ++++++
 doc/source/serve/key-concepts.rst             |  84 +++++
 doc/source/serve/overview.rst                 | 306 ------------------
 doc/source/serve/tutorials/index.rst          |  19 ++
 .../{pytorch-tutorial.rst => pytorch.rst}     |   4 +-
 .../{sklearn-tutorial.rst => sklearn.rst}     |   4 +-
 ...tensorflow-tutorial.rst => tensorflow.rst} |   4 +-
 python/ray/serve/BUILD                        |   9 +
 .../ray/serve/examples/doc/tutorial_deploy.py | 169 ++++++++++
 12 files changed, 705 insertions(+), 316 deletions(-)
 create mode 100644 doc/source/serve/advanced.rst
 create mode 100644 doc/source/serve/deployment.rst
 create mode 100644 doc/source/serve/index.rst
 create mode 100644 doc/source/serve/key-concepts.rst
 delete mode 100644 doc/source/serve/overview.rst
 create mode 100644 doc/source/serve/tutorials/index.rst
 rename doc/source/serve/tutorials/{pytorch-tutorial.rst => pytorch.rst} (89%)
 rename doc/source/serve/tutorials/{sklearn-tutorial.rst => sklearn.rst} (89%)
 rename doc/source/serve/tutorials/{tensorflow-tutorial.rst => tensorflow.rst} (90%)
 create mode 100644 python/ray/serve/examples/doc/tutorial_deploy.py

diff --git a/doc/source/index.rst b/doc/source/index.rst
index a3aefa20f..0789c7ddc 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -187,10 +187,11 @@ Getting Involved
    :maxdepth: -1
    :caption: Ray Serve
 
-   serve/overview.rst
-   serve/tutorials/tensorflow-tutorial.rst
-   serve/tutorials/pytorch-tutorial.rst
-   serve/tutorials/sklearn-tutorial.rst
+   serve/index.rst
+   serve/key-concepts.rst
+   serve/tutorials/index.rst
+   serve/deployment.rst
+   serve/advanced.rst
 
 .. toctree::
    :maxdepth: -1
diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst
new file mode 100644
index 000000000..2081e2df6
--- /dev/null
+++ b/doc/source/serve/advanced.rst
@@ -0,0 +1,149 @@
+======================================
+Advanced Topics, Configurations, & FAQ
+======================================
+
+Ray Serve has a number of knobs and tools for you to tune for your particular workload. 
+All Ray Serve advanced options and topics are covered on this page aside from the 
+fundamentals of :doc:`deployment`. For a more hands-on take, please check out the :ref:`serve-tutorials`.
+
+There are a number of things you'll likely want to do with your serving application including
+scaling out, splitting traffic, or batching input for better performance. To do all of this,
+you will create a ``BackendConfig``, a configuration object that you'll use to set 
+the properties of a particular backend.
+
+.. contents::
+
+Scaling Out
+===========
+
+To scale out a backend to multiple workers, simply configure the number of replicas.
+
+.. code-block:: python
+
+  config = {"num_replicas": 10}
+  serve.create_backend("my_scaled_endpoint_backend", handle_request, config=config)
+
+  # scale it back down...
+  config = {"num_replicas": 2}
+  serve.set_backend_config("my_scaled_endpoint_backend", handle_request, config=config)
+
+This will scale up or down the number of workers that can accept requests.
+
+Using Resources (CPUs, GPUs)
+============================
+
+To assign hardware resources per worker, you can pass resource requirements to
+``ray_actor_options``. To learn about options to pass in, take a look at the
+:ref:`Resources with Actor<actor-resource-guide>` guide.
+
+For example, to create a backend where each replica uses a single GPU, you can do the
+following:
+
+.. code-block:: python
+
+  config = {"num_gpus": 1}
+  serve.create_backend("my_gpu_backend", handle_request, ray_actor_options=config)
+
+.. note::
+
+  Deep learning models like PyTorch and Tensorflow often use all the CPUs when
+  performing inference. Ray sets the environment variable ``OMP_NUM_THREADS=1`` to
+  :ref:`avoid contention<omp-num-thread-note>`. This means each worker will only
+  use one CPU instead of all of them.
+
+.. _serve-batching:
+
+Batching to improve performance
+===============================
+
+You can also have Ray Serve batch requests for performance. In order to use this feature, you need to:
+
+1. Set the ``max_batch_size`` in the ``BackendConfig``.
+2. Modify your backend implementation to accept a list of requests and return a list of responses instead of handling a single request.
+
+.. code-block:: python
+
+  class BatchingExample:
+      def __init__(self):
+          self.count = 0
+
+      @serve.accept_batch
+      def __call__(self, requests):
+          responses = []
+          for request in requests:
+              responses.append(request.json())
+          return responses
+
+  serve.create_endpoint("counter1", "/increment")
+
+  config = {"max_batch_size": 5}
+  serve.create_backend("counter1", BatchingExample, config=config)
+  serve.set_traffic("counter1", {"counter1": 1.0})
+
+.. _`serve-split-traffic`:
+
+Splitting Traffic and A/B Testing
+==================================
+
+It's also trivial to split traffic: simply specify the endpoint and the backends that you want to split traffic between.
+
+.. code-block:: python
+  
+  serve.create_endpoint("endpoint_identifier_split", "/split", methods=["GET", "POST"])
+
+  # splitting traffic 70/30
+  serve.set_traffic("endpoint_identifier_split", {"my_endpoint_backend": 0.7, "my_endpoint_backend_class": 0.3})
+
+While splitting traffic is generally simple, at times you'll want to consider :ref:`session-affinity`, which makes it easy to
+control which users see which version of the model. See the docs on :ref:`session-affinity` for more information.
+
+.. _session-affinity:
+
+Session Affinity
+================
+
+In some cases, you may want to ensure that requests from the same client, user, etc. get mapped to the same backend.
+To do this, you can specify a "shard key" that will deterministically map requests to a backend.
+The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or ``handle.options(shard_key="key")``.
+
+.. note:: The mapping from shard key to backend may change when you update the traffic policy for an endpoint.
+
+.. code-block:: python
+
+  # Specifying the shard key via an HTTP header.
+  requests.get("http://127.0.0.1:8000/api", headers={"X-SERVE-SHARD-KEY": session_id})
+
+  # Specifying the shard key in a call made via serve handle.
+  handle = serve.get_handle("api_endpoint")
+  handle.options(shard_key=session_id).remote(args)
+
+
+.. _serve-faq:
+
+Ray Serve FAQ
+=============
+
+How do I deploy serve?
+----------------------
+
+See :doc:`deployment` for information about how to deploy serve.
+
+How do I delete backends and endpoints?
+---------------------------------------
+
+To delete a backend, you can use ``serve.delete_backend``.
+Note that the backend must not be in use by any endpoints in order to be deleted.
+Once a backend is deleted, its tag can be reused.
+
+.. code-block:: python
+
+  serve.delete_backend("simple_backend")
+
+
+To delete an endpoint, you can use ``serve.delete_endpoint``.
+Note that the endpoint will no longer work and will return a 404 when queried.
+Once an endpoint is deleted, its tag can be reused.
+
+.. code-block:: python
+
+  serve.delete_endpoint("simple_endpoint")
diff --git a/doc/source/serve/deployment.rst b/doc/source/serve/deployment.rst
new file mode 100644
index 000000000..683af72ed
--- /dev/null
+++ b/doc/source/serve/deployment.rst
@@ -0,0 +1,171 @@
+===================
+Deploying Ray Serve
+===================
+
+In the :doc:`key-concepts`, you saw some of the basics of how to write serve applications.
+This section will dive a bit deeper into how Ray Serve runs on a Ray cluster and how you're able 
+to deploy and update your serve application over time.
+
+To deploy a Ray Serve application (and cluster) you're going to need several things.
+
+1. A running Ray cluster (you can deploy one on your local machine for testing).
+2. A Ray Serve cluster. To learn more about Ray clusters see :doc:`../cluster-index`.
+3. Your Ray Serve endpoint(s) and backend(s).
+
+.. contents:: Deploying Ray Serve
+
+.. _serve-deploy-tutorial:
+
+Deploying a Model with Ray Serve
+================================
+
+Let's get started deploying our first Ray Serve application. The first thing you'll need
+to do is start a Ray cluster. You can do that using the Ray autoscaler, but in our case
+we'll create it on our local machine. To learn more about Ray Clusters see :doc:`../cluster-index`.
+
+Starting the Cluster
+--------------------
+We do that by running:
+
+.. code::
+
+    ray start --head
+
+That starts a cluster on our local machine. We can shut that down by running ``ray stop``. You should 
+run this after we complete this tutorial. 
+
+Setup: Training a Model
+-----------------------
+
+Make sure you install `Scikit-learn <https://scikit-learn.org/stable/>`_.
+
+Place the following in a python script and run it. In this example we're training
+a model and saving it to disk for us to load into our Ray Serve app.
+
+.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py
+    :start-after: __doc_import_train_begin__
+    :end-before: __doc_import_train_end__
+
+As discussed in other :doc:`tutorials/index`, we can use any framework to build these models. In general,
+you'll just want to have the ability to persist these models to disk.
+
+Now that we've trained that model and saved it to disk (keep in mind this could also be a service like S3),
+we'll need to create a backend to serve the model.
+
+Creating a Model and Serving it
+-------------------------------
+
+In the following snippet we will complete two things:
+
+1. Define a servable model by instantiating a class and defining the ``__call__`` method.
+2. Connect to our running Ray cluster (``ray.init(...)``) and then start or connect to the Ray Serve service
+   on that cluster (``serve.init(...)``).
+
+You can see that defining the model is straightforward and simple, we're simply instantiating
+the model like we might a typical Python class.
+
+Configuring our model to accept traffic is specified via ``.set_traffic`` after we created
+a backend in serve for our model (and versioned it with a string).
+
+.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py
+    :start-after: __doc_create_deploy_begin__
+    :end-before: __doc_create_deploy_end__
+
+What serve does when we run this code is store the model as a Ray actor 
+and route traffic to it as the endpoint is queried, in this case over HTTP.
+
+Let's now query our endpoint to see the result.
+
+Querying our Endpoint
+---------------------
+
+We'll use the requests library to query our endpoint and be able to get a result.
+
+.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py
+    :start-after: __doc_query_begin__
+    :end-before: __doc_query_end__
+
+
+Now that we have defined a model and have it running on our Ray cluster, let's proceed with updating
+this model with a new set of code.
+
+Updating Your Model Over Time
+=============================
+
+Updating our model is as simple as deploying the first one. While the code snippet includes 
+a lot of information, all that we're doing is we are defining a new model, saving it, then loading 
+it into serve. The key lines are at the end.
+
+.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py
+    :start-after: __doc_create_deploy_2_begin__
+    :end-before: __doc_create_deploy_2_end__
+
+Consequently, since Ray Serve runs as a service, all we need to tell it is that (a) there's a new model
+and (b) how much traffic we should send to that model (and from what endpoint).
+
+We do that with the line at the end of the code snippet, which allows us to split traffic between
+these two models.
+
+.. code::
+
+    serve.set_traffic("iris_classifier", {"lr:v2": 0.25, "lr:v1": 0.75})
+
+While this is a simple operation, you may want to see :ref:`serve-split-traffic` for more information. 
+One thing you may want to consider as well is :ref:`session-affinity`, which gives you the ability to ensure
+that queries from users/clients always get mapped to the same backend
+version.
+
+Now that we're up and running serving two models in production, let's query 
+our results several times to see some results. You'll notice that we're now splitting
+traffic between these two different models.
+
+Querying our Endpoint
+---------------------
+
+We'll use the requests library to query our endpoint and be able to get a result.
+
+.. literalinclude:: ../../../python/ray/serve/examples/doc/tutorial_deploy.py
+    :start-after: __doc_query_begin__
+    :end-before: __doc_query_end__
+
+If you run this code several times, you'll notice that the output will change - this 
+is due to us running the two models in parallel that we created above.
+
+Upon concluding the above tutorial, you'll want to run ``ray stop`` to 
+shut down the Ray cluster on your local machine.
+
+Deployment FAQ
+==============
+
+Best practices for local development
+------------------------------------
+
+One thing you may notice is that we never have to declare a ``while True`` loop or 
+something to keep the Ray Serve process running. In general, we don't recommend using forever loops and therefore 
+opt for launching a Ray Cluster locally. Specify a Ray cluster like we did in :ref:`serve-deploy-tutorial`.
+To learn more, in general, about Ray Clusters see :doc:`../cluster-index`.
+
+
+Deploying Multiple Serve Clusters on a Single Ray Cluster
+---------------------------------------------------------
+
+You can run multiple serve clusters on the same Ray cluster by providing a ``cluster_name`` to ``serve.init()``.
+
+.. code-block:: python
+
+  # Create a first cluster whose HTTP server listens on 8000.
+  serve.init(cluster_name="cluster1", http_port=8000)
+  serve.create_endpoint("counter1", "/increment")
+
+  # Create a second cluster whose HTTP server listens on 8001.
+  serve.init(cluster_name="cluster2", http_port=8001)
+  serve.create_endpoint("counter1", "/increment")
+
+  # Create a backend that will be served on the second cluster.
+  serve.create_backend("counter1", function)
+  serve.set_traffic("counter1", {"counter1": 1.0})
+
+  # Switch back to the first cluster and create the same backend on it.
+  serve.init(cluster_name="cluster1")
+  serve.create_backend("counter1", function)
+  serve.set_traffic("counter1", {"counter1": 1.0})
diff --git a/doc/source/serve/index.rst b/doc/source/serve/index.rst
new file mode 100644
index 000000000..17d470c62
--- /dev/null
+++ b/doc/source/serve/index.rst
@@ -0,0 +1,93 @@
+.. _rayserve:
+
+============================================
+Ray Serve: Scalable and Programmable Serving
+============================================
+
+.. image:: logo.svg
+    :align: center
+    :height: 250px
+    :width: 400px
+
+.. _rayserve-overview:
+
+Ray Serve is a scalable model-serving library built on Ray.
+
+For users, Ray Serve is:
+
+- **Framework Agnostic**: Use the same toolkit to serve everything from deep learning models
+  built with frameworks like :ref:`PyTorch <serve-pytorch-tutorial>` or 
+  :ref:`Tensorflow & Keras <serve-tensorflow-tutorial>` to :ref:`Scikit-Learn <serve-sklearn-tutorial>` models or arbitrary business logic.
+- **Python First**: Configure your model serving with pure Python code - no more YAMLs or 
+  JSON configs.
+
+As a library, Ray Serve enables: 
+
+- :ref:`serve-split-traffic` with zero downtime by decoupling routing logic from response handling logic.
+- :ref:`serve-batching` built-in to help you meet your performance objectives or use your model for batch and online processing.
+
+Since Ray Serve is built on Ray, it also allows you to **scale to many machines**
+and allows you to leverage all of the other Ray frameworks so you can deploy and scale on any cloud.
+
+.. note:: 
+  If you want to try out Serve, join our `community slack <https://forms.gle/9TSdDYUgxYs8SA9e8>`_ 
+  and discuss in the #serve channel.
+
+
+Installation
+============
+
+Ray Serve supports Python versions 3.5 and higher. To install Ray Serve:
+
+.. code-block:: bash
+
+  pip install "ray[serve]"
+
+Ray Serve in 90 Seconds
+=======================
+
+Serve a function by defining a function, an endpoint, and a backend (in this case a stateless function) then 
+connecting the two by setting traffic from the endpoint to the backend.
+
+.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_function.py
+
+Serve a stateful class by defining a class (``Counter``), creating an endpoint and a backend, then connecting
+the two by setting traffic from the endpoint to the backend.
+
+.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_class.py
+
+See :doc:`key-concepts` for more exhaustive coverage about Ray Serve and its core concepts.
+
+Why Ray Serve?
+==============
+
+There are generally two ways of serving machine learning applications, both with serious limitations:
+you can build using a **traditional webserver** - your own Flask app or you can use a cloud hosted solution.
+
+The first approach is easy to get started with, but it's hard to scale each component. The second approach
+requires vendor lock-in (SageMaker), framework specific tooling (TFServing), and a general
+lack of flexibility.
+
+Ray Serve solves these problems by giving a user the ability to leverage the simplicity
+of deployment of a simple webserver but handles the complex routing, scaling, and testing logic
+necessary for production deployments.
+
+For more on the motivation behind Ray Serve, check out these `meetup slides <https://tinyurl.com/serve-meetup>`_.
+
+When should I use Ray Serve?
+----------------------------
+
+Ray Serve is a simple (but flexible) tool for deploying, operating, and monitoring Python based machine learning models.
+Ray Serve excels when scaling out to serve models in production is a necessity. This might be because of large scale batch processing
+requirements or because you're going to serve a number of models behind different endpoints and may need to run A/B tests or control 
+traffic between different models.
+
+If you plan on running on multiple machines, Ray Serve will serve you well.
+
+What's next?
+============
+
+Check out the :doc:`key-concepts`, learn more about :doc:`advanced`, look at the :ref:`serve-faq`,
+or head over to the :doc:`tutorials/index` to get started building your Ray Serve Applications.
+
+
diff --git a/doc/source/serve/key-concepts.rst b/doc/source/serve/key-concepts.rst
new file mode 100644
index 000000000..e8a858238
--- /dev/null
+++ b/doc/source/serve/key-concepts.rst
@@ -0,0 +1,84 @@
+============
+Key Concepts
+============
+
+Ray Serve focuses on **simplicity** and only has two core concepts: endpoints and backends.
+
+To follow along, you'll need to make the necessary imports.
+
+.. code-block:: python
+
+  from ray import serve
+  serve.init() # Initializes Ray and Ray Serve.
+
+.. _serve-endpoint:
+
+Endpoints
+=========
+
+Endpoints allow you to name the "entity" that you'll be exposing, 
+the HTTP path that your application will expose. 
+Endpoints are "logical" and decoupled from the business logic or 
+model that you'll be serving. To create one, we'll simply specify the name, route, and methods.
+
+.. code-block:: python
+
+  serve.create_endpoint("simple_endpoint", "/simple")
+
+You can also delete an endpoint using `serve.delete_endpoint`.
+Note that this will not delete any associated backends, which can be reused for other endpoints.
+
+.. code-block:: python
+
+  serve.delete_endpoint("simple_endpoint")
+
+.. _serve-backend:
+
+Backends
+========
+
+Backends are the logical structures for your business logic or models and 
+how you specify what should happen when an endpoint is queried.
+To define a backend, first you must define the "handler" or the business logic you'd like to respond with. 
+The input to this request will be a `Flask Request object <https://flask.palletsprojects.com/en/1.1.x/api/?highlight=request#flask.Request>`_.
+Use a function when your response is stateless and a class when you
+might need to maintain some state (like a model). 
+For both functions and classes (that take as input Flask Requests), you'll need to 
+define them as backends to Ray Serve.
+
+It's important to note that Ray Serve places these backends in individual worker processes, which are replicas of the model.
+
+.. code-block:: python
+  
+  def handle_request(flask_request):
+    return "hello world"
+
+  class RequestHandler:
+    def __init__(self):
+        self.msg = "hello, world!"
+
+    def __call__(self, flask_request):
+        return self.msg
+
+  serve.create_backend("simple_backend", handle_request)
+  serve.create_backend("simple_backend_class", RequestHandler)
+
+Setting Traffic
+===============
+
+Lastly, we need to route traffic from the endpoint to the particular backend.
+To do that we'll use the ``set_traffic`` capability.
+A traffic policy is essentially a load-balancer and allows you to define queuing policies
+for how you would like backends to be served via an endpoint.
+For instance, you can route 50% of traffic to Model A and 50% of traffic to Model B.
+
+.. code-block:: python
+
+  serve.set_traffic("simple_endpoint", {"simple_backend": 1.0})
+
+Once we've done that, we can now query our endpoint via HTTP (we use `requests` to make HTTP calls here).
+
+.. code-block:: python
+  
+  import requests
+  print(requests.get("http://127.0.0.1:8000/-/routes", timeout=0.5).text)
diff --git a/doc/source/serve/overview.rst b/doc/source/serve/overview.rst
deleted file mode 100644
index 62998ed5a..000000000
--- a/doc/source/serve/overview.rst
+++ /dev/null
@@ -1,306 +0,0 @@
-.. _rayserve:
-
-Ray Serve: Scalable and Programmable Serving
-============================================
-
-.. image:: logo.svg
-    :align: center
-    :height: 250px
-    :width: 400px
-
-.. _rayserve-overview:
-
-Overview
---------
-
-Ray Serve is a scalable model-serving library built on Ray.
-
-For users Ray Serve is:
-
-- **Framework Agnostic**:Use the same toolkit to serve everything from deep learning models 
-  built with frameworks like PyTorch or TensorFlow to scikit-learn models or arbitrary business logic.
-- **Python First**: Configure your model serving with pure Python code - no more YAMLs or 
-  JSON configs.
-
-Ray Serve enables: 
-
--  **A/B test models** with zero downtime by decoupling routing logic from response handling logic.
-- **Batching** built-in to help you meet your performance objectives.
-
-Since Ray is built on Ray, Ray Serve also allows you to **scale to many machines**
-and allows you to leverage all of the other Ray frameworks so you can deploy and scale on any cloud.
-
-.. note:: 
-  If you want to try out Serve, join our `community slack <https://forms.gle/9TSdDYUgxYs8SA9e8>`_ 
-  and discuss in the #serve channel.
-
-
-Installation
-~~~~~~~~~~~~
-Ray Serve supports Python versions 3.5 and higher. To install Ray Serve:
-
-.. code-block:: bash
-
-  pip install "ray[serve]"
-
-
-
-Ray Serve in 90 Seconds
-~~~~~~~~~~~~~~~~~~~~~~~
-
-Serve a stateless function:
-
-.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_function.py
-
-Serve a stateful class:
-
-.. literalinclude:: ../../../python/ray/serve/examples/doc/quickstart_class.py
-
-See :ref:`serve-key-concepts` for more information about working with Ray Serve.
-
-Why Ray Serve?
-~~~~~~~~~~~~~~
-
-There are generally two ways of serving machine learning applications, both with serious limitations:
-you can build using a **traditional webserver** - your own Flask app or you can use a cloud hosted solution.
-
-The first approach is easy to get started with, but it's hard to scale each component. The second approach
-requires vendor lock-in (SageMaker), framework specific tooling (TFServing), and a general
-lack of flexibility.
-
-Ray Serve solves these problems by giving a user the ability to leverage the simplicity
-of deployment of a simple webserver but handles the complex routing, scaling, and testing logic
-necessary for production deployments.
-
-For more on the motivation behind Ray Serve, check out these `meetup slides <https://tinyurl.com/serve-meetup>`_.
-
-When should I use Ray Serve?
-++++++++++++++++++++++++++++
-
-Ray Serve should be used when you need to deploy at least one model, preferrably many models.  
-Ray Serve **won't work well** when you need to run batch prediction over a dataset. Given this use case, we recommend looking into `multiprocessing with Ray </multiprocessing.html>`_.
-
-.. _serve-key-concepts:
-
-Key Concepts
-------------
-
-Ray Serve focuses on **simplicity** and only has two core concepts: endpoints and backends.
-
-To follow along, you'll need to make the necessary imports.
-
-.. code-block:: python
-
-  from ray import serve
-  serve.init() # initializes serve and Ray
-
-.. _serve-endpoint:
-
-Endpoints
-~~~~~~~~~
-
-Endpoints allow you to name the "entity" that you'll be exposing, 
-the HTTP path that your application will expose. 
-Endpoints are "logical" and decoupled from the business logic or 
-model that you'll be serving. To create one, we'll simply specify the name, route, and methods.
-
-.. code-block:: python
-
-  serve.create_endpoint("simple_endpoint", "/simple")
-
-You can also delete an endpoint using `serve.delete_endpoint`.
-Note that this will not delete any associated backends, which can be reused for other endpoints.
-
-.. code-block:: python
-
-  serve.delete_endpoint("simple_endpoint")
-
-.. _serve-backend:
-
-Backends
-~~~~~~~~
-
-Backends are the logical structures for your business logic or models and 
-how you specify what should happen when an endpoint is queried.
-To define a backend, first you must define the "handler" or the business logic you'd like to respond with. 
-The input to this request will be a `Flask Request object <https://flask.palletsprojects.com/en/1.1.x/api/?highlight=request#flask.Request>`_.
-Once you define the function (or class) that will handle a request. 
-You'd use a function when your response is stateless and a class when you
-might need to maintain some state (like a model). 
-For both functions and classes (that take as input Flask Requests), you'll need to 
-define them as backends to Ray Serve.
-
-It's important to note that Ray Serve places these backends in individual workers, which are replicas of the model.
-
-.. code-block:: python
-  
-  def handle_request(flask_request):
-    return "hello world"
-
-  class RequestHandler:
-    def __init__(self):
-        self.msg = "hello, world!"
-
-    def __call__(self, flask_request):
-        return self.msg
-
-  serve.create_backend("simple_backend", handle_request)
-  serve.create_backend("simple_backend_class", RequestHandler)
-
-Lastly, we need to link the particular backend to the server endpoint. 
-To do that we'll use the ``link`` capability.
-A link is essentially a load-balancer and allow you to define queuing policies 
-for how you would like backends to be served via an endpoint.
-For instance, you can route 50% of traffic to Model A and 50% of traffic to Model B.
-
-.. code-block:: python
-
-  serve.set_traffic("simple_backend", {"simple_endpoint": 1.0})
-
-Once we've done that, we can now query our endpoint via HTTP (we use `requests` to make HTTP calls here).
-
-.. code-block:: python
-  
-  import requests
-  print(requests.get("http://127.0.0.1:8000/-/routes", timeout=0.5).text)
-
-To delete a backend, we can use `serve.delete_backend`.
-Note that the backend must not be use by any endpoints in order to be delete.
-Once a backend is deleted, its tag can be reused.
-
-.. code-block:: python
-
-  serve.delete_backend("simple_backend")
-
-Configuring Backends
-~~~~~~~~~~~~~~~~~~~~
-
-There are a number of things you'll likely want to do with your serving application including
-scaling out, splitting traffic, or batching input for better response performance. To do all of this,
-you will create a ``BackendConfig``, a configuration object that you'll use to set 
-the properties of a particular backend.
-
-Scaling Out
-+++++++++++
-
-To scale out a backend to multiple workers, simplify configure the number of replicas.
-
-.. code-block:: python
-
-  config = {"num_replicas": 2}
-  serve.create_backend("my_scaled_endpoint_backend", handle_request, config=config)
-
-This will scale out the number of workers that can accept requests.
-
-Using Resources (CPUs, GPUs)
-++++++++++++++++++++++++++++
-To assign hardware resource per worker, you can pass resource requirements to
-``ray_actor_options``. To learn about options to pass in, take a look at
-:ref:`Resources with Actor<actor-resource-guide>` guide.
-
-For example, to create a backend where each replica uses a single GPU, you can do the
-following:
-
-.. code-block:: python
-
-  options = {"num_gpus": 1}
-  serve.create_backend("my_gpu_backend", handle_request, ray_actor_options=options)
-
-.. note::
-
-  Deep learning models like PyTorch and Tensorflow often use all the CPUs when
-  performing inference. Ray sets the environment variable ``OMP_NUM_THREADS=1`` to
-  :ref:`avoid contention<omp-num-thread-note>`. This means each worker will only
-  use one CPU instead of all of them.
-
-Splitting Traffic
-+++++++++++++++++
-
-It's trivial to also split traffic, simply specify the endpoint and the backends that you want to split.
-
-.. code-block:: python
-  
-  serve.create_endpoint("endpoint_identifier_split", "/split", methods=["GET", "POST"])
-
-  # splitting traffic 70/30
-  serve.set_traffic("endpoint_identifier_split", {"my_endpoint_backend": 0.7, "my_endpoint_backend_class": 0.3})
-
-
-Batching
-++++++++
-
-You can also have Ray Serve batch requests for performance. You'll configure this in the backend config.
-
-.. code-block:: python
-
-  class BatchingExample:
-      def __init__(self):
-          self.count = 0
-
-      @serve.accept_batch
-      def __call__(self, flask_request):
-          self.count += 1
-          batch_size = serve.context.batch_size
-          return [self.count] * batch_size
-
-  serve.create_endpoint("counter1", "/increment")
-
-  config = {"max_batch_size": 5}
-  serve.create_backend("counter1", BatchingExample, config=config)
-  serve.set_traffic("counter1", {"counter1": 1.0})
-
-Session Affinity
-++++++++++++++++
-
-In some cases, you may want to ensure that requests from the same client, user, etc. get mapped to the same backend.
-To do this, you can specify a "shard key" that will deterministically map requests to a backend.
-The shard key can either be specified via the X-SERVE-SHARD-KEY HTTP header or ``handle.options(shard_key="key")``.
-
-.. note:: The mapping from shard key to backend may change when you update the traffic policy for an endpoint.
-
-.. code-block:: python
-
-  # Specifying the shard key via an HTTP header.
-  requests.get("127.0.0.1:8000/api", headers={"X-SERVE-SHARD-KEY": session_id})
-
-  # Specifying the shard key in a call made via serve handle.
-  handle = serve.get_handle("api_endpoint")
-  handler.options(shard_key=session_id).remote(args)
-
-Running Multiple Serve Clusters on one Ray Cluster
-++++++++++++++++++++++++++++++++++++++++++++++++++
-
-You can run multiple serve clusters on the same Ray cluster by providing a ``cluster_name`` to ``serve.init()``.
-
-.. code-block:: python
-
-  # Create a first cluster whose HTTP server listens on 8000.
-  serve.init(cluster_name="cluster1", http_port=8000)
-  serve.create_endpoint("counter1", "/increment")
-
-  # Create a second cluster whose HTTP server listens on 8001.
-  serve.init(cluster_name="cluster2", http_port=8001)
-  serve.create_endpoint("counter1", "/increment")
-
-  # Create a backend that will be served on the second cluster.
-  serve.create_backend("counter1", function)
-  serve.set_traffic("counter1", {"counter1": 1.0})
-
-  # Switch back the the first cluster and create the same backend on it.
-  serve.init(cluster_name="cluster1")
-  serve.create_backend("counter1", function)
-  serve.set_traffic("counter1", {"counter1": 1.0})
-
-Other Resources
----------------
-
-.. _serve_frameworks:
-
-Frameworks
-~~~~~~~~~~
-Ray Serve makes it easy to deploy models from all popular frameworks.
-Learn more about how to deploy your model in the following tutorials:
-
-- :ref:`Tensorflow & Keras <serve-tensorflow-tutorial>`
-- :ref:`PyTorch <serve-pytorch-tutorial>`
-- :ref:`Scikit-Learn <serve-sklearn-tutorial>`
diff --git a/doc/source/serve/tutorials/index.rst b/doc/source/serve/tutorials/index.rst
new file mode 100644
index 000000000..20d62dbee
--- /dev/null
+++ b/doc/source/serve/tutorials/index.rst
@@ -0,0 +1,19 @@
+=========
+Tutorials
+=========
+
+Below is a list of tutorials that you can use to learn more about the different pieces of
+Ray Serve functionality and how to integrate different modeling frameworks.
+
+.. toctree::
+   :caption: Serve Tutorials
+   :name: serve-tutorials
+   :maxdepth: -1
+
+   tensorflow.rst
+   pytorch.rst
+   sklearn.rst
+
+Other Topics:
+
+- :doc:`../deployment`
\ No newline at end of file
diff --git a/doc/source/serve/tutorials/pytorch-tutorial.rst b/doc/source/serve/tutorials/pytorch.rst
similarity index 89%
rename from doc/source/serve/tutorials/pytorch-tutorial.rst
rename to doc/source/serve/tutorials/pytorch.rst
index fb2d2cd2c..214637557 100644
--- a/doc/source/serve/tutorials/pytorch-tutorial.rst
+++ b/doc/source/serve/tutorials/pytorch.rst
@@ -9,10 +9,10 @@ In particular, we show:
 - How to load the model from PyTorch's pre-trained modelzoo.
 - How to parse the JSON request, transform the payload and evaluated in the model.
 
-Please see the :ref:`overview <rayserve-overview>` to learn more general information about Ray Serve.
+Please see the :doc:`../key-concepts` to learn more general information about Ray Serve.
 
 This tutorial requires Pytorch and Torchvision installed in your system. Ray Serve
-is :ref:`framework agnostic <serve_frameworks>` and work with any version of PyTorch.
+is framework agnostic and works with any version of PyTorch.
 
 .. code-block:: bash
 
diff --git a/doc/source/serve/tutorials/sklearn-tutorial.rst b/doc/source/serve/tutorials/sklearn.rst
similarity index 89%
rename from doc/source/serve/tutorials/sklearn-tutorial.rst
rename to doc/source/serve/tutorials/sklearn.rst
index 3e0d5f745..752a49d34 100644
--- a/doc/source/serve/tutorials/sklearn-tutorial.rst
+++ b/doc/source/serve/tutorials/sklearn.rst
@@ -9,9 +9,9 @@ In particular, we show:
 - How to load the model from file system in your Ray Serve definition
 - How to parse the JSON request and evaluated in sklearn model
 
-Please see the :ref:`overview <rayserve-overview>` to learn more general information about Ray Serve.
+Please see the :doc:`../key-concepts` to learn more general information about Ray Serve.
 
-Ray Serve supports :ref:`arbitrary frameworks <serve_frameworks>`. You can use any version of sklearn.
+Ray Serve is framework agnostic. You can use any version of sklearn.
 
 .. code-block:: bash
 
diff --git a/doc/source/serve/tutorials/tensorflow-tutorial.rst b/doc/source/serve/tutorials/tensorflow.rst
similarity index 90%
rename from doc/source/serve/tutorials/tensorflow-tutorial.rst
rename to doc/source/serve/tutorials/tensorflow.rst
index e63b9a1f4..73bc577dc 100644
--- a/doc/source/serve/tutorials/tensorflow-tutorial.rst
+++ b/doc/source/serve/tutorials/tensorflow.rst
@@ -9,9 +9,9 @@ In particular, we show:
 - How to load the model from file system in your Ray Serve definition
 - How to parse the JSON request and evaluated in Tensorflow
 
-Please see the :ref:`overview <rayserve-overview>` to learn more general information about Ray Serve.
+Please see the :doc:`../key-concepts` to learn more general information about Ray Serve.
 
-Ray Serve makes it easy to deploy models from :ref:`all popular frameworks <serve_frameworks>`.
+Ray Serve is framework agnostic, so you can use any version of Tensorflow.
 However, for this tutorial, we use Tensorflow 2 and Keras. Please make sure you have
 Tensorflow 2 installed.
 
diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD
index 124a6a7b4..e680e3561 100644
--- a/python/ray/serve/BUILD
+++ b/python/ray/serve/BUILD
@@ -88,3 +88,12 @@ py_test(
     tags = ["exclusive"],
     deps = [":serve_lib"]
 )
+
+py_test(
+    name = "tutorial_deploy",
+    size = "small",
+    srcs = glob(["examples/doc/*.py"]),
+    tags = ["exclusive"],
+    deps = [":serve_lib"]
+)
+
diff --git a/python/ray/serve/examples/doc/tutorial_deploy.py b/python/ray/serve/examples/doc/tutorial_deploy.py
new file mode 100644
index 000000000..d074ac597
--- /dev/null
+++ b/python/ray/serve/examples/doc/tutorial_deploy.py
@@ -0,0 +1,169 @@
+# yapf: disable
+# __doc_import_train_begin__
+import pickle
+import json
+import numpy as np
+
+from sklearn.datasets import load_iris
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.metrics import mean_squared_error
+
+# Load data
+iris_dataset = load_iris()
+data, target, target_names = iris_dataset["data"], iris_dataset[
+    "target"], iris_dataset["target_names"]
+
+# Instantiate model
+model = GradientBoostingClassifier()
+
+# Training and validation split (rows and labels are shuffled together)
+order = np.random.permutation(len(data))
+train_x, train_y = data[order[:100]], target[order[:100]]
+val_x, val_y = data[order[100:]], target[order[100:]]
+
+# Train and evaluate models
+model.fit(train_x, train_y)
+print("MSE:", mean_squared_error(val_y, model.predict(val_x)))
+
+# Save the model and label to file
+with open("/tmp/iris_model_logistic_regression.pkl", "wb") as f:
+    pickle.dump(model, f)
+with open("/tmp/iris_labels.json", "w") as f:
+    json.dump(target_names.tolist(), f)
+# __doc_import_train_end__
+
+
+# __doc_create_deploy_begin__
+import pickle  # noqa: E402
+import json  # noqa: E402
+
+from ray import serve  # noqa: E402
+import ray  # noqa: E402
+
+
+class BoostingModel:  # Serve backend wrapping the pickled iris classifier
+    def __init__(self):  # load the model and label names from /tmp
+        with open("/tmp/iris_model_logistic_regression.pkl", "rb") as f:
+            self.model = pickle.load(f)
+        with open("/tmp/iris_labels.json") as f:
+            self.label_list = json.load(f)
+
+    def __call__(self, flask_request):  # features arrive as JSON
+        payload = flask_request.json
+        print("Worker: received flask request with data", payload)
+        # Assemble the features in a fixed order for the model.
+        input_vector = [
+            payload["sepal length"],
+            payload["sepal width"],
+            payload["petal length"],
+            payload["petal width"],
+        ]
+        prediction = self.model.predict([input_vector])[0]
+        human_name = self.label_list[prediction]  # class index -> label name
+        return {"result": human_name, "version": "v1"}
+
+
+# Connect to our existing Ray cluster.
+# (Also pass redis_password=... if your cluster was started with one.)
+ray.init(address="auto")
+# Now we initialize / connect to Ray Serve and send all traffic to lr:v1.
+
+serve.init()
+serve.create_endpoint("iris_classifier", "/regressor")
+serve.create_backend("lr:v1", BoostingModel)
+serve.set_traffic("iris_classifier", {"lr:v1": 1})
+# __doc_create_deploy_end__
+
+# __doc_query_begin__
+import requests  # noqa: E402
+
+sample_request_input = {
+    "sepal length": 1.2,
+    "sepal width": 1.0,
+    "petal length": 1.1,
+    "petal width": 0.9,
+}
+response = requests.get(
+    "http://localhost:8000/regressor", json=sample_request_input)
+print(response.text)
+# Example response:
+# {
+#  "result": "setosa",
+#  "version": "v1"
+# }
+# This result may vary, since the model is retrained on a random split.
+# As we update this model, this result will also change over time.
+# __doc_query_end__
+
+
+# __doc_create_deploy_2_begin__
+import pickle  # noqa: E402
+import json  # noqa: E402
+import numpy as np  # noqa: E402
+
+from sklearn.datasets import load_iris  # noqa: E402
+from sklearn.ensemble import GradientBoostingClassifier  # noqa: E402
+from sklearn.metrics import mean_squared_error  # noqa: E402
+
+# Load data
+iris_dataset = load_iris()
+data, target, target_names = iris_dataset["data"], iris_dataset[
+    "target"], iris_dataset["target_names"]
+
+# Instantiate model
+model = GradientBoostingClassifier()
+
+# Training and validation split (rows and labels are shuffled together)
+order = np.random.permutation(len(data))
+train_x, train_y = data[order[:100]], target[order[:100]]
+val_x, val_y = data[order[100:]], target[order[100:]]
+
+# Train and evaluate models
+model.fit(train_x, train_y)
+print("MSE:", mean_squared_error(val_y, model.predict(val_x)))
+
+# Save the model and label to file
+with open("/tmp/iris_model_logistic_regression_2.pkl", "wb") as f:
+    pickle.dump(model, f)
+with open("/tmp/iris_labels_2.json", "w") as f:
+    json.dump(target_names.tolist(), f)
+
+
+import pickle  # noqa: E402
+import json  # noqa: E402
+
+from ray import serve  # noqa: E402
+import ray  # noqa: E402
+
+
+class BoostingModelv2:  # second backend version; serves the *_2 artifacts
+    def __init__(self):  # load the model and label names from /tmp
+        with open("/tmp/iris_model_logistic_regression_2.pkl", "rb") as f:
+            self.model = pickle.load(f)
+        with open("/tmp/iris_labels_2.json") as f:
+            self.label_list = json.load(f)
+
+    def __call__(self, flask_request):  # features arrive as JSON
+        payload = flask_request.json
+        print("Worker: received flask request with data", payload)
+        # Assemble the features in a fixed order for the model.
+        input_vector = [
+            payload["sepal length"],
+            payload["sepal width"],
+            payload["petal length"],
+            payload["petal width"],
+        ]
+        prediction = self.model.predict([input_vector])[0]
+        human_name = self.label_list[prediction]  # class index -> label name
+        return {"result": human_name, "version": "v2"}
+
+# Connect to our existing Ray cluster if this process has not done so yet.
+# Note that the password will be different for your redis instance!
+# ray.init(address='auto', redis_password='5241590000000000')
+# Now we initialize / connect to Ray Serve and split traffic 25% / 75%.
+
+
+serve.init()
+serve.create_backend("lr:v2", BoostingModelv2)
+serve.set_traffic("iris_classifier", {"lr:v2": 0.25, "lr:v1": 0.75})
+# __doc_create_deploy_2_end__