mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 05:11:18 +08:00
250 lines
7.4 KiB
Python
250 lines
7.4 KiB
Python
import os
|
|
import requests
|
|
import tempfile
|
|
import time
|
|
|
|
import ray
|
|
from ray import serve
|
|
|
|
|
|
def request_with_retries(endpoint, timeout=30):
    """GET ``endpoint`` from the local HTTP server, retrying on failure.

    Args:
        endpoint: Path appended to ``http://127.0.0.1:8000``.
        timeout: Seconds. Used both as the per-request timeout handed to
            ``requests.get`` and as the overall retry budget.

    Returns:
        The response of the first ``requests.get`` call that does not raise.

    Raises:
        TimeoutError: If a request fails once more than ``timeout`` seconds
            have passed since the first attempt. The last
            ``requests.RequestException`` is attached as ``__cause__`` so
            the underlying failure is visible in the traceback.
    """
    url = "http://127.0.0.1:8000" + endpoint
    # Monotonic clock: a wall-clock adjustment must not change the budget.
    start = time.monotonic()
    while True:
        try:
            return requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            elapsed = time.monotonic() - start
            if elapsed > timeout:
                raise TimeoutError(
                    "GET {} still failing after {:.1f}s".format(
                        url, elapsed)) from err
            # Short pause before retrying so we do not spin on a dead server.
            time.sleep(0.1)
|
|
|
|
|
|
def test_master_failure(serve_instance):
    """Exercise Serve endpoints while the master actor is repeatedly killed.

    The master actor is killed between plain requests, before a new backend
    is created and traffic is shifted to it, and around every step of
    creating a second backend/endpoint. After each phase the endpoints must
    return the expected text.
    """
    serve.init()

    def kill_master():
        # no_restart=False: the kill does not forbid a restart of the actor.
        ray.kill(serve.api._get_master_actor(), no_restart=False)

    def expect(routes_and_texts, rounds=10):
        # Query every (route, expected text) pair, in order, `rounds` times.
        for _ in range(rounds):
            for route, expected in routes_and_texts:
                reply = request_with_retries(route, timeout=30)
                assert reply.text == expected

    def function():
        return "hello1"

    serve.create_backend("master_failure:v1", function)
    serve.create_endpoint(
        "master_failure", backend="master_failure:v1", route="/master_failure")

    first_reply = request_with_retries("/master_failure", timeout=1)
    assert first_reply.text == "hello1"

    expect([("/master_failure", "hello1")])

    # Requests must still be served after the master actor is killed.
    kill_master()
    expect([("/master_failure", "hello1")])

    def function():
        return "hello2"

    # Kill the master right before changing the serving configuration.
    kill_master()

    serve.create_backend("master_failure:v2", function)
    serve.set_traffic("master_failure", {"master_failure:v2": 1.0})

    expect([("/master_failure", "hello2")])

    def function():
        return "hello3"

    # Kill the master before, between, and after the two creation calls.
    kill_master()
    serve.create_backend("master_failure_2", function)
    kill_master()
    serve.create_endpoint(
        "master_failure_2",
        backend="master_failure_2",
        route="/master_failure_2")
    kill_master()

    expect([
        ("/master_failure", "hello2"),
        ("/master_failure_2", "hello3"),
    ])
|
|
|
|
|
|
def _kill_http_proxy():
    """Kill the HTTP proxy actor reported by the Serve master actor.

    Unpacking into a one-element tuple raises ``ValueError`` unless the
    master reports exactly one proxy handle. ``no_restart=False`` is passed
    to ``ray.kill`` so the kill itself does not forbid a restart.
    """
    master = serve.api._get_master_actor()
    (proxy_handle,) = ray.get(master.get_http_proxy.remote())
    ray.kill(proxy_handle, no_restart=False)
|
|
|
|
|
|
def test_http_proxy_failure(serve_instance):
    """Check that an endpoint keeps working after the HTTP proxy is killed.

    Serves "hello1", kills the HTTP proxy actor, then creates a second
    backend, routes all traffic to it, and expects "hello2".
    """
    serve.init()

    def expect(text, rounds=10):
        # Hit the endpoint `rounds` times and require `text` every time.
        for _ in range(rounds):
            reply = request_with_retries("/proxy_failure", timeout=30)
            assert reply.text == text

    def function():
        return "hello1"

    serve.create_backend("proxy_failure:v1", function)
    serve.create_endpoint(
        "proxy_failure", backend="proxy_failure:v1", route="/proxy_failure")

    first_reply = request_with_retries("/proxy_failure", timeout=1.0)
    assert first_reply.text == "hello1"

    expect("hello1")

    _kill_http_proxy()

    def function():
        return "hello2"

    # Reconfigure after the kill; requests must observe the new backend.
    serve.create_backend("proxy_failure:v2", function)
    serve.set_traffic("proxy_failure", {"proxy_failure:v2": 1.0})

    expect("hello2")
|
|
|
|
|
|
def _kill_router():
    """Kill the router actor reported by the Serve master actor.

    Unpacking into a one-element tuple raises ``ValueError`` unless the
    master reports exactly one router handle. ``no_restart=False`` is passed
    to ``ray.kill`` so the kill itself does not forbid a restart.
    """
    master = serve.api._get_master_actor()
    (router_handle,) = ray.get(master.get_router.remote())
    ray.kill(router_handle, no_restart=False)
|
|
|
|
|
|
def test_router_failure(serve_instance):
    """Check that an endpoint keeps working after the router is killed.

    Serves "hello1", kills the router actor and expects "hello1" again,
    then creates a second backend, routes all traffic to it, and expects
    "hello2".
    """
    serve.init()

    def expect(text, rounds=10):
        # Hit the endpoint `rounds` times and require `text` every time.
        for _ in range(rounds):
            reply = request_with_retries("/router_failure", timeout=30)
            assert reply.text == text

    def function():
        return "hello1"

    serve.create_backend("router_failure:v1", function)
    serve.create_endpoint(
        "router_failure", backend="router_failure:v1", route="/router_failure")

    first_reply = request_with_retries("/router_failure", timeout=5)
    assert first_reply.text == "hello1"

    expect("hello1")

    _kill_router()

    # The existing backend must still answer after the router was killed.
    expect("hello1")

    def function():
        return "hello2"

    serve.create_backend("router_failure:v2", function)
    serve.set_traffic("router_failure", {"router_failure:v2": 1.0})

    expect("hello2")
|
|
|
|
|
|
def _get_worker_handles(backend):
    """Return the worker handles the master actor holds for ``backend``.

    The master's ``get_all_worker_handles`` result is indexed by backend
    tag; the values of that backend's mapping are returned as a list.
    """
    handles_by_backend = ray.get(
        serve.api._get_master_actor().get_all_worker_handles.remote())
    handles_for_backend = handles_by_backend[backend]
    return list(handles_for_backend.values())
|
|
|
|
|
|
# Test that a worker dying unexpectedly causes it to restart and continue
# serving requests.
def test_worker_restart(serve_instance):
    """Kill the single worker of a backend and wait for a replacement.

    The backend replies with its process ID, so a reply that differs from
    the PID seen before the kill shows that a different process is serving.
    """
    serve.init()

    class Worker1:
        def __call__(self):
            return os.getpid()

    serve.create_backend("worker_failure:v1", Worker1)
    serve.create_endpoint(
        "worker_failure", backend="worker_failure:v1", route="/worker_failure")

    # Get the PID of the worker.
    old_pid = request_with_retries("/worker_failure", timeout=1).text

    # Kill the worker.
    handles = _get_worker_handles("worker_failure:v1")
    assert len(handles) == 1
    ray.kill(handles[0], no_restart=False)

    # Wait until the worker is killed and a new one is started.
    start = time.time()
    worker_replaced = False
    while time.time() - start < 30:
        current_pid = request_with_retries("/worker_failure", timeout=30).text
        if current_pid != old_pid:
            worker_replaced = True
            break
    assert worker_replaced, "Timed out waiting for worker to die."
|
|
|
|
|
|
# Test that if there are multiple replicas for a worker and one dies
# unexpectedly, the others continue to serve requests.
def test_worker_replica_failure(serve_instance):
    """Kill one of two replicas and check the endpoint still answers.

    The replicas coordinate through a counter file: the first two
    constructions succeed, any later construction (i.e. a restart) spins
    forever in ``__init__``. After one replica is killed, only the
    surviving replica can therefore answer requests.
    """
    serve.http_proxy.MAX_ACTOR_DEAD_RETRIES = 0
    serve.init()

    class Worker:
        # Assumes that two replicas are started. Will hang forever in the
        # constructor for any workers that are restarted.
        def __init__(self, path):
            self.should_hang = False
            if not os.path.exists(path):
                # First replica: create the counter file.
                with open(path, "w") as f:
                    f.write("1")
            else:
                with open(path, "r") as f:
                    num = int(f.read())

                with open(path, "w") as f:
                    if num == 2:
                        # Two replicas already started: this is a restart.
                        self.should_hang = True
                    else:
                        f.write(str(num + 1))

            if self.should_hang:
                while True:
                    pass

        def __call__(self):
            # Reply with the PID so the test can tell the replicas apart
            # (assumes each replica runs in its own process).
            return os.getpid()

    temp_path = os.path.join(tempfile.gettempdir(),
                             serve.utils.get_random_letters())
    serve.create_backend("replica_failure", Worker, temp_path)
    serve.update_backend_config("replica_failure", {"num_replicas": 2})
    serve.create_endpoint(
        "replica_failure", backend="replica_failure", route="/replica_failure")

    # Wait until both replicas have been started, i.e. until replies from
    # two distinct worker processes have been seen. The loop is bounded so
    # a replica that never comes up fails the test instead of hanging it.
    responses = set()
    start = time.time()
    while time.time() - start < 30:
        responses.add(request_with_retries("/replica_failure", timeout=1).text)
        if len(responses) == 2:
            break
        time.sleep(0.1)
    else:
        assert False, "Timed out waiting for both replicas to start."

    # Kill one of the replicas.
    handles = _get_worker_handles("replica_failure")
    assert len(handles) == 2
    ray.kill(handles[0], no_restart=False)

    # Check that the other replica still serves requests.
    for _ in range(10):
        while True:
            try:
                # The timeout needs to be small here because the request to
                # the restarting worker will hang.
                request_with_retries("/replica_failure", timeout=0.1)
                break
            except TimeoutError:
                time.sleep(0.1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Running the file directly hands it to pytest (verbose, output not
    # captured) and exits with pytest's return code.
    import sys
    import pytest

    pytest_args = ["-v", "-s", __file__]
    exit_code = pytest.main(pytest_args)
    sys.exit(exit_code)
|