From 6676b30eeffc4f93445f7f6ef0f7c556eebe8ad9 Mon Sep 17 00:00:00 2001
From: architkulkarni <architkulkarni@users.noreply.github.com>
Date: Wed, 7 Oct 2020 10:57:40 -0700
Subject: [PATCH] [Serve] Serve multi node tests (#10980)

---
 ci/long_running_tests/workloads/serve.py      |  14 +-
 .../baselines/2020-10-07/handle.txt           |   9 +
 .../baselines/2020-10-07/microbenchmark.txt   |  37 ++++
 .../baselines/2020-10-07/scalability.txt      | 209 ++++++++++++++++++
 python/ray/serve/benchmarks/cluster.yaml      |  45 ++++
 python/ray/serve/benchmarks/handle.py         |  86 +++++++
 python/ray/serve/benchmarks/microbenchmark.py |  67 ++++--
 python/ray/serve/benchmarks/scalability.py    | 119 ++++++++++
 8 files changed, 553 insertions(+), 33 deletions(-)
 create mode 100644 python/ray/serve/benchmarks/baselines/2020-10-07/handle.txt
 create mode 100644 python/ray/serve/benchmarks/baselines/2020-10-07/microbenchmark.txt
 create mode 100644 python/ray/serve/benchmarks/baselines/2020-10-07/scalability.txt
 create mode 100644 python/ray/serve/benchmarks/cluster.yaml
 create mode 100644 python/ray/serve/benchmarks/handle.py
 create mode 100644 python/ray/serve/benchmarks/scalability.py

diff --git a/ci/long_running_tests/workloads/serve.py b/ci/long_running_tests/workloads/serve.py
index 58f405c11..6d404ac51 100644
--- a/ci/long_running_tests/workloads/serve.py
+++ b/ci/long_running_tests/workloads/serve.py
@@ -24,13 +24,6 @@ for i in range(num_nodes):
         redis_max_memory=redis_max_memory,
         dashboard_host="0.0.0.0")
 
-print("Downloading load testing tool")
-subprocess.call([
-    "bash", "-c", "rm hey_linux_amd64 || true;"
-    "wget https://storage.googleapis.com/hey-release/hey_linux_amd64;"
-    "chmod +x hey_linux_amd64"
-])
-
 ray.init(address=cluster.address, dashboard_host="0.0.0.0")
 client = serve.start()
 
@@ -54,12 +47,15 @@ for _ in range(5):
     time.sleep(0.5)
 
 connections = int(config["num_replicas"] * config["max_batch_size"] * 0.75)
+num_threads = 2
+time_to_run = "60m"
 
 while True:
     proc = subprocess.Popen(
         [
-            "./hey_linux_amd64", "-c",
-            str(connections), "-z", "60m", "http://127.0.0.1:8000/echo"
+            "wrk", "-c",
+            str(connections), "-t",
+            str(num_threads), "-s", time_to_run, "http://127.0.0.1:8000/echo"
         ],
         stdout=PIPE,
         stderr=PIPE)
diff --git a/python/ray/serve/benchmarks/baselines/2020-10-07/handle.txt b/python/ray/serve/benchmarks/baselines/2020-10-07/handle.txt
new file mode 100644
index 000000000..ca4f94a28
--- /dev/null
+++ b/python/ray/serve/benchmarks/baselines/2020-10-07/handle.txt
@@ -0,0 +1,9 @@
+0 forwarders and 1 worker replicas: 1282 requests/s
+0 forwarders and 5 worker replicas: 1375 requests/s
+0 forwarders and 10 worker replicas: 1362 requests/s
+1 forwarders and 1 worker replicas: 608 requests/s
+1 forwarders and 5 worker replicas: 626 requests/s
+1 forwarders and 10 worker replicas: 627 requests/s
+2 forwarders and 1 worker replicas: 609 requests/s
+2 forwarders and 5 worker replicas: 620 requests/s
+2 forwarders and 10 worker replicas: 609 requests/s
\ No newline at end of file
diff --git a/python/ray/serve/benchmarks/baselines/2020-10-07/microbenchmark.txt b/python/ray/serve/benchmarks/baselines/2020-10-07/microbenchmark.txt
new file mode 100644
index 000000000..e7d07e5bd
--- /dev/null
+++ b/python/ray/serve/benchmarks/baselines/2020-10-07/microbenchmark.txt
@@ -0,0 +1,37 @@
+2020-10-07 09:20:47,110 INFO worker.py:634 -- Connecting to existing Ray cluster at address: 172.31.146.145:6379
+{'max_batch_size': 1, 'max_concurrent_queries': 1, 'num_replicas': 1}, intermediate_handles=False:
+        single client small data 94.94 +- 0.41 requests/s
+        8 clients small data 480.75 +- 28.58 requests/s
+{'max_batch_size': 1, 'max_concurrent_queries': 10000, 'num_replicas': 1}, intermediate_handles=False:
+        single client small data 94.14 +- 1.89 requests/s
+        8 clients small data 457.8 +- 8.53 requests/s
+{'max_batch_size': 10000, 'max_concurrent_queries': 10000, 'num_replicas': 1}, intermediate_handles=False:
+        single client small data 95.34 +- 0.81 requests/s
+        8 clients small data 455.35 +- 2.0 requests/s
+{'max_batch_size': 1, 'max_concurrent_queries': 1, 'num_replicas': 8}, intermediate_handles=False:
+        single client small data 77.61 +- 1.22 requests/s
+        8 clients small data 623.41 +- 5.7 requests/s
+{'max_batch_size': 1, 'max_concurrent_queries': 10000, 'num_replicas': 8}, intermediate_handles=False:
+        single client small data 77.38 +- 0.73 requests/s
+        8 clients small data 642.45 +- 9.81 requests/s
+{'max_batch_size': 10000, 'max_concurrent_queries': 10000, 'num_replicas': 8}, intermediate_handles=False:
+        single client small data 77.99 +- 2.44 requests/s
+        8 clients small data 636.86 +- 10.21 requests/s
+{'max_batch_size': 1, 'max_concurrent_queries': 1, 'num_replicas': 1}, intermediate_handles=True:
+        single client small data 52.91 +- 0.18 requests/s
+        8 clients small data 311.4 +- 1.87 requests/s
+{'max_batch_size': 1, 'max_concurrent_queries': 10000, 'num_replicas': 1}, intermediate_handles=True:
+        single client small data 52.68 +- 0.45 requests/s
+        8 clients small data 311.67 +- 2.45 requests/s
+{'max_batch_size': 10000, 'max_concurrent_queries': 10000, 'num_replicas': 1}, intermediate_handles=True:
+        single client small data 52.79 +- 0.5 requests/s
+        8 clients small data 309.75 +- 3.03 requests/s
+{'max_batch_size': 1, 'max_concurrent_queries': 1, 'num_replicas': 8}, intermediate_handles=True:
+        single client small data 49.92 +- 0.52 requests/s
+        8 clients small data 295.44 +- 2.01 requests/s
+{'max_batch_size': 1, 'max_concurrent_queries': 10000, 'num_replicas': 8}, intermediate_handles=True:
+        single client small data 49.74 +- 0.74 requests/s
+        8 clients small data 296.69 +- 1.84 requests/s
+{'max_batch_size': 10000, 'max_concurrent_queries': 10000, 'num_replicas': 8}, intermediate_handles=True:
+        single client small data 49.57 +- 0.36 requests/s
+        8 clients small data 293.47 +- 1.88 requests/s
\ No newline at end of file
diff --git a/python/ray/serve/benchmarks/baselines/2020-10-07/scalability.txt b/python/ray/serve/benchmarks/baselines/2020-10-07/scalability.txt
new file mode 100644
index 000000000..978a0b076
--- /dev/null
+++ b/python/ray/serve/benchmarks/baselines/2020-10-07/scalability.txt
@@ -0,0 +1,209 @@
+2020-10-07 09:15:57,387	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 1 of 21:
+2020-10-07 09:15:57,388	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   263.96ms   96.29ms 506.39ms   69.14%
+    Req/Sec   115.63     79.29   650.00     75.40%
+  2307 requests in 10.00s, 315.66KB read
+Requests/sec:    230.61
+Transfer/sec:     31.55KB
+
+2020-10-07 09:15:57,388	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 2 of 21:
+2020-10-07 09:15:57,389	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   282.79ms   75.00ms 500.26ms   63.32%
+    Req/Sec   108.20     60.17   240.00     58.92%
+  2159 requests in 10.02s, 295.42KB read
+Requests/sec:    215.47
+Transfer/sec:     29.48KB
+
+2020-10-07 09:15:57,389	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 3 of 21:
+2020-10-07 09:15:57,390	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   284.60ms   75.70ms 505.43ms   63.33%
+    Req/Sec   108.37     59.15   240.00     63.39%
+  2149 requests in 10.00s, 294.43KB read
+Requests/sec:    214.80
+Transfer/sec:     29.43KB
+
+2020-10-07 09:15:57,390	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 4 of 21:
+2020-10-07 09:15:57,391	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   282.22ms   77.38ms 508.22ms   65.28%
+    Req/Sec   108.98     61.47   250.00     62.98%
+  2163 requests in 10.03s, 295.97KB read
+Requests/sec:    215.60
+Transfer/sec:     29.50KB
+
+2020-10-07 09:15:57,392	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 5 of 21:
+2020-10-07 09:15:57,392	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   278.77ms   79.19ms 509.73ms   64.19%
+    Req/Sec   109.77     62.22   303.00     67.03%
+  2195 requests in 10.01s, 300.35KB read
+Requests/sec:    219.32
+Transfer/sec:     30.01KB
+
+2020-10-07 09:15:57,395	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 6 of 21:
+2020-10-07 09:15:57,396	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   267.82ms   93.51ms 500.75ms   67.95%
+    Req/Sec   114.75     72.64   480.00     69.57%
+  2281 requests in 10.01s, 312.35KB read
+Requests/sec:    227.96
+Transfer/sec:     31.22KB
+
+2020-10-07 09:15:57,396	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 7 of 21:
+2020-10-07 09:15:57,397	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   285.04ms   75.40ms 492.30ms   63.57%
+    Req/Sec   108.51     61.33   242.00     58.06%
+  2144 requests in 10.00s, 293.50KB read
+Requests/sec:    214.32
+Transfer/sec:     29.34KB
+
+2020-10-07 09:15:57,397	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 8 of 21:
+2020-10-07 09:15:57,398	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   271.42ms   89.56ms 508.15ms   68.90%
+    Req/Sec   113.23     66.06   340.00     65.95%
+  2254 requests in 10.01s, 308.16KB read
+Requests/sec:    225.13
+Transfer/sec:     30.78KB
+
+2020-10-07 09:15:57,399	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 9 of 21:
+2020-10-07 09:15:57,400	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   269.81ms   91.28ms 500.31ms   68.51%
+    Req/Sec   113.60     72.20   410.00     64.17%
+  2255 requests in 10.01s, 308.67KB read
+Requests/sec:    225.38
+Transfer/sec:     30.85KB
+
+2020-10-07 09:15:57,400	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 10 of 21:
+2020-10-07 09:15:57,401	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   283.93ms   74.67ms 507.84ms   64.11%
+    Req/Sec   108.93     55.89   242.00     62.09%
+  2154 requests in 10.01s, 294.86KB read
+Requests/sec:    215.24
+Transfer/sec:     29.46KB
+
+2020-10-07 09:15:57,401	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 11 of 21:
+2020-10-07 09:15:57,402	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   282.35ms   75.86ms 495.90ms   63.86%
+    Req/Sec   108.15     59.10   222.00     60.22%
+  2158 requests in 10.00s, 295.41KB read
+Requests/sec:    215.72
+Transfer/sec:     29.53KB
+
+2020-10-07 09:15:57,402	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 12 of 21:
+2020-10-07 09:15:57,403	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   283.37ms   75.04ms 499.01ms   63.79%
+    Req/Sec   108.77     61.15   262.00     60.00%
+  2157 requests in 10.01s, 295.03KB read
+Requests/sec:    215.55
+Transfer/sec:     29.48KB
+
+2020-10-07 09:15:57,406	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 13 of 21:
+2020-10-07 09:15:57,406	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   273.55ms   86.08ms 504.93ms   66.70%
+    Req/Sec   112.26     64.36   272.00     60.33%
+  2234 requests in 10.01s, 305.93KB read
+Requests/sec:    223.25
+Transfer/sec:     30.57KB
+
+2020-10-07 09:15:57,407	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 14 of 21:
+2020-10-07 09:15:57,407	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   279.76ms   78.28ms 506.39ms   65.57%
+    Req/Sec   109.14     62.56   272.00     60.75%
+  2181 requests in 10.00s, 298.93KB read
+Requests/sec:    218.03
+Transfer/sec:     29.88KB
+
+2020-10-07 09:15:57,408	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 15 of 21:
+2020-10-07 09:15:57,408	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   285.56ms   75.73ms 493.67ms   63.94%
+    Req/Sec   108.23     58.97   240.00     61.41%
+  2149 requests in 10.04s, 294.43KB read
+Requests/sec:    214.12
+Transfer/sec:     29.34KB
+
+2020-10-07 09:15:57,409	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 16 of 21:
+2020-10-07 09:15:57,410	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   275.84ms   83.78ms 504.36ms   66.76%
+    Req/Sec   110.52     61.73   290.00     63.64%
+  2208 requests in 10.00s, 302.00KB read
+Requests/sec:    220.72
+Transfer/sec:     30.19KB
+
+2020-10-07 09:15:57,410	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 17 of 21:
+2020-10-07 09:15:57,411	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   277.22ms   82.84ms 501.03ms   66.64%
+    Req/Sec   110.92     59.32   240.00     63.78%
+  2206 requests in 10.04s, 301.60KB read
+Requests/sec:    219.78
+Transfer/sec:     30.05KB
+
+2020-10-07 09:15:57,411	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 18 of 21:
+2020-10-07 09:15:57,412	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   282.49ms   76.79ms 495.28ms   64.60%
+    Req/Sec   108.16     61.20   250.00     61.62%
+  2161 requests in 10.04s, 296.44KB read
+Requests/sec:    215.19
+Transfer/sec:     29.52KB
+
+2020-10-07 09:15:57,414	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 19 of 21:
+2020-10-07 09:15:57,414	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   281.33ms   77.56ms 504.81ms   65.10%
+    Req/Sec   108.46     61.28   252.00     60.44%
+  2166 requests in 10.01s, 296.50KB read
+Requests/sec:    216.46
+Transfer/sec:     29.63KB
+
+2020-10-07 09:15:57,415	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 20 of 21:
+2020-10-07 09:15:57,416	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   284.18ms   74.78ms 506.23ms   62.89%
+    Req/Sec   109.16     60.05   240.00     59.46%
+  2153 requests in 10.01s, 294.85KB read
+Requests/sec:    215.16
+Transfer/sec:     29.47KB
+
+2020-10-07 09:15:57,417	INFO <ipython-input-1-1b7ee15d465b>:111 -- Results for node 21 of 21:
+2020-10-07 09:15:57,417	INFO <ipython-input-1-1b7ee15d465b>:112 -- Running 10s test @ http://127.0.0.1:8000/hey
+  2 threads and 63 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency   281.58ms   78.06ms 499.69ms   65.93%
+    Req/Sec   108.51     60.56   260.00     64.29%
+  2163 requests in 10.01s, 296.09KB read
+Requests/sec:    216.12
+Transfer/sec:     29.59KB
diff --git a/python/ray/serve/benchmarks/cluster.yaml b/python/ray/serve/benchmarks/cluster.yaml
new file mode 100644
index 000000000..70e6d9df4
--- /dev/null
+++ b/python/ray/serve/benchmarks/cluster.yaml
@@ -0,0 +1,45 @@
+cluster_name: default
+min_workers: 22
+max_workers: 22
+initial_workers: 22
+autoscaling_mode: default
+docker:
+    image: 'anyscale/ray-ml:latest'
+    container_name: ray_container
+    pull_before_run: true
+target_utilization_fraction: 0.8
+idle_timeout_minutes: 5
+provider:
+    type: aws
+    region: us-west-2
+    availability_zone: us-west-2a
+auth:
+    ssh_user: ubuntu
+head_node:
+    InstanceType: m5.xlarge
+    BlockDeviceMappings:
+        -
+            DeviceName: /dev/sda1
+            Ebs:
+                VolumeSize: 100
+worker_nodes:
+    InstanceType: m5.xlarge
+file_mounts: {}
+cluster_synced_files:
+    - /tmp/ray_tmp_mount/~/blank-serve-test
+file_mounts_sync_continuously: true
+initialization_commands: []
+setup_commands:
+    - apt-get install build-essential libssl-dev git -y
+    - 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make && cp wrk /usr/local/bin'
+head_setup_commands: []
+worker_setup_commands: []
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+worker_start_ray_commands:
+    - ray stop
+    - 'ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076'
+metadata:
+    anyscale:
+        working_dir: ~/blank-serve-test
diff --git a/python/ray/serve/benchmarks/handle.py b/python/ray/serve/benchmarks/handle.py
new file mode 100644
index 000000000..b8d243e88
--- /dev/null
+++ b/python/ray/serve/benchmarks/handle.py
@@ -0,0 +1,86 @@
+# A test that stresses the serve handles. We spin up a backend with a bunch
+# (0-2) of replicas that just forward requests to another backend.
+#
+# By comparing using the forward replicas with just calling the worker
+# replicas, we measure the (latency) overhead in the handle. This answers
+# the question: How much of a latency/throughput hit is there when I
+# compose models?
+#
+# By comparing the qps as we fix the number of forwarder replicas and vary the
+# number of worker replicas, we measure the limit of a single async actor. This
+# answers the question: How many "ForwardActor"s or these kinds of high-level
+# pipeline workers do I need to provision for my workload? every 1k qps,
+# 2k qps?
+#
+# Sample output:
+# 0 forwarders and 1 worker replicas: 1282 requests/s
+# 0 forwarders and 5 worker replicas: 1375 requests/s
+# 0 forwarders and 10 worker replicas: 1362 requests/s
+# 1 forwarders and 1 worker replicas: 608 requests/s
+# 1 forwarders and 5 worker replicas: 626 requests/s
+# 1 forwarders and 10 worker replicas: 627 requests/s
+# 2 forwarders and 1 worker replicas: 609 requests/s
+# 2 forwarders and 5 worker replicas: 620 requests/s
+# 2 forwarders and 10 worker replicas: 609 requests/s
+
+import ray
+from ray import serve
+from ray.serve import BackendConfig
+from ray.serve.utils import logger
+import time
+
+num_queries = 2000
+
+ray.init(address="auto")
+
+client = serve.start()
+
+
+def hello_world(_):
+    return b"Hello World"
+
+
+class ForwardActor:
+    def __init__(self):
+        client = serve.connect()
+        self.handle = client.get_handle("hello_world")
+
+    async def __call__(self, _):
+        await self.handle.remote()
+
+
+client.create_backend("hello_world", hello_world)
+client.create_endpoint("hello_world", backend="hello_world")
+
+client.create_backend("ForwardActor", ForwardActor)
+client.create_endpoint("ForwardActor", backend="ForwardActor")
+
+
+def run_test(num_replicas, num_forwarders):
+    replicas_config = BackendConfig(num_replicas=num_replicas)
+    client.update_backend_config("hello_world", replicas_config)
+
+    if (num_forwarders == 0):
+        handle = client.get_handle("hello_world")
+    else:
+        forwarders_config = BackendConfig(num_replicas=num_forwarders)
+        client.update_backend_config("ForwardActor", forwarders_config)
+        handle = client.get_handle("ForwardActor")
+
+    # warmup - helpful to wait for gc.collect() and actors to start
+    start = time.time()
+    while time.time() - start < 1:
+        ray.get(handle.remote())
+
+    # real test
+    start = time.time()
+    ray.get([handle.remote() for _ in range(num_queries)])
+    qps = num_queries / (time.time() - start)
+
+    logger.info("{} forwarders and {} worker replicas: {} requests/s".format(
+        num_forwarders, num_replicas, int(qps)))
+
+
+for num_forwarders in [0, 1, 2]:
+    for num_replicas in [1, 5, 10]:
+        run_test(num_replicas, num_forwarders)
diff --git a/python/ray/serve/benchmarks/microbenchmark.py b/python/ray/serve/benchmarks/microbenchmark.py
index de63581c9..e4f058e0f 100644
--- a/python/ray/serve/benchmarks/microbenchmark.py
+++ b/python/ray/serve/benchmarks/microbenchmark.py
@@ -1,3 +1,7 @@
+# Runs several scenarios with varying max batch size, max concurrent queries,
+# number of replicas, and with intermediate serve handles (to simulate ensemble
+# models) either on or off.
+
 import aiohttp
 import asyncio
 import time
@@ -79,33 +83,48 @@ async def trial(actors, session, data_size):
 async def main():
     ray.init(log_to_driver=False)
     client = serve.start()
-
     client.create_backend("backend", backend)
     client.create_endpoint("endpoint", backend="backend", route="/api")
+    for intermediate_handles in [False, True]:
+        if (intermediate_handles):
 
-    actors = [Client.remote() for _ in range(NUM_CLIENTS)]
-    for num_replicas in [1, 8]:
-        for backend_config in [
-            {
-                "max_batch_size": 1,
-                "max_concurrent_queries": 1
-            },
-            {
-                "max_batch_size": 1,
-                "max_concurrent_queries": 10000
-            },
-            {
-                "max_batch_size": 10000,
-                "max_concurrent_queries": 10000
-            },
-        ]:
-            backend_config["num_replicas"] = num_replicas
-            client.update_backend_config("backend", backend_config)
-            print(repr(backend_config) + ":")
-            async with aiohttp.ClientSession() as session:
-                # TODO(edoakes): large data causes broken pipe errors.
-                for data_size in ["small"]:
-                    await trial(actors, session, data_size)
+            client.create_endpoint(
+                "backend", backend="backend", route="/backend")
+
+            class forwardActor:
+                def __init__(self):
+                    self.handle = client.get_handle("backend")
+
+                def __call__(self, _):
+                    return ray.get(self.handle.remote())
+
+            client.create_backend("forwardActor", forwardActor)
+            client.set_traffic("endpoint", {"backend": 0, "forwardActor": 1})
+        actors = [Client.remote() for _ in range(NUM_CLIENTS)]
+        for num_replicas in [1, 8]:
+            for backend_config in [
+                {
+                    "max_batch_size": 1,
+                    "max_concurrent_queries": 1
+                },
+                {
+                    "max_batch_size": 1,
+                    "max_concurrent_queries": 10000
+                },
+                {
+                    "max_batch_size": 10000,
+                    "max_concurrent_queries": 10000
+                },
+            ]:
+                backend_config["num_replicas"] = num_replicas
+                client.update_backend_config("backend", backend_config)
+                print(
+                    repr(backend_config) + ", intermediate_handles=" +
+                    str(intermediate_handles) + ":")
+                async with aiohttp.ClientSession() as session:
+                    # TODO(edoakes): large data causes broken pipe errors.
+                    for data_size in ["small"]:
+                        await trial(actors, session, data_size)
 
 
 if __name__ == "__main__":
diff --git a/python/ray/serve/benchmarks/scalability.py b/python/ray/serve/benchmarks/scalability.py
new file mode 100644
index 000000000..d11564567
--- /dev/null
+++ b/python/ray/serve/benchmarks/scalability.py
@@ -0,0 +1,119 @@
+# A multi-node scalability test. We put an http proxy on the head node and spin
+# up 20 nodes and put as many replicas as possible on the cluster, and run a
+# stress test.
+#
+# Test will measure latency and throughput under a high load using `wrk`
+# running on each node.
+#
+# Results for node 1 of 21:
+# Running 10s test @ http://127.0.0.1:8000/hey
+#   2 threads and 63 connections
+#   Thread Stats   Avg      Stdev     Max   +/- Stdev
+#     Latency   263.96ms   96.29ms 506.39ms   69.14%
+#     Req/Sec   115.63     79.29   650.00     75.40%
+#   2307 requests in 10.00s, 315.66KB read
+# Requests/sec:    230.61
+# Transfer/sec:     31.55KB
+#
+# Results for node 2 of 21:
+# Running 10s test @ http://127.0.0.1:8000/hey
+#   2 threads and 63 connections
+#   Thread Stats   Avg      Stdev     Max   +/- Stdev
+#     Latency   282.79ms   75.00ms 500.26ms   63.32%
+#     Req/Sec   108.20     60.17   240.00     58.92%
+#   2159 requests in 10.02s, 295.42KB read
+# Requests/sec:    215.47
+# Transfer/sec:     29.48KB
+#
+# [...] similar results for remaining nodes
+
+import time
+import subprocess
+import requests
+
+import ray
+from ray import serve
+from ray.serve import BackendConfig
+from ray.serve.utils import logger
+
+from ray.util.placement_group import (placement_group, remove_placement_group)
+
+ray.shutdown()
+ray.init(address="auto")
+client = serve.start()
+
+# These numbers need to correspond with the autoscaler config file.
+# The number of remote nodes in the autoscaler should upper bound
+# these because sometimes nodes fail to update.
+num_workers = 20
+expected_num_nodes = num_workers + 1
+cpus_per_node = 4
+num_remote_cpus = expected_num_nodes * cpus_per_node
+
+# Wait until the expected number of nodes have joined the cluster.
+while True:
+    num_nodes = len(ray.nodes())
+    logger.info("Waiting for nodes {}/{}".format(num_nodes,
+                                                 expected_num_nodes))
+    if num_nodes >= expected_num_nodes:
+        break
+    time.sleep(5)
+logger.info("Nodes have all joined. There are %s resources.",
+            ray.cluster_resources())
+
+
+def hey(_):
+    time.sleep(0.01)  # Sleep for 10ms
+    return b"hey"
+
+
+num_connections = int(num_remote_cpus * 0.75)
+num_threads = 2
+time_to_run = "10s"
+
+pg = placement_group(
+    [{
+        "CPU": 1
+    } for _ in range(expected_num_nodes)], strategy="STRICT_SPREAD")
+ray.get(pg.ready())
+
+# The number of replicas is the number of cores remaining after accounting
+# for the one HTTP proxy actor on each node, the "hey" requester task on each
+# node, and the serve controller.
+# num_replicas = expected_num_nodes * (cpus_per_node - 2) - 1
+num_replicas = ray.available_resources()["CPU"]
+logger.info("Starting %i replicas", num_replicas)
+client.create_backend(
+    "hey", hey, config=BackendConfig(num_replicas=num_replicas))
+client.create_endpoint("hey", backend="hey", route="/hey")
+
+
+@ray.remote
+def run_wrk():
+    logger.info("Warming up for ~3 seconds")
+    for _ in range(5):
+        resp = requests.get("http://127.0.0.1:8000/hey").text
+        logger.info("Received response \'" + resp + "\'")
+        time.sleep(0.5)
+
+    result = subprocess.run(
+        [
+            "wrk", "-c",
+            str(num_connections), "-t",
+            str(num_threads), "-d", time_to_run, "http://127.0.0.1:8000/hey"
+        ],
+        stdout=subprocess.PIPE)
+    return result.stdout.decode()
+
+
+results = ray.get([
+    run_wrk.options(placement_group=pg,
+                    placement_group_bundle_index=i).remote()
+    for i in range(expected_num_nodes)
+])
+
+for i in range(expected_num_nodes):
+    logger.info("Results for node %i of %i:", i + 1, expected_num_nodes)
+    logger.info(results[i])
+
+remove_placement_group(pg)