[serve] Create all other actors in master actor (#7791)

This commit is contained in:
Edward Oakes
2020-04-01 10:15:04 -05:00
committed by GitHub
parent b011c604d7
commit f4239d27fa
5 changed files with 144 additions and 218 deletions
+29 -81
View File
@@ -9,10 +9,10 @@ import numpy as np
import ray
from ray.serve.constants import (DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT,
SERVE_MASTER_NAME)
from ray.serve.global_state import GlobalState, start_initial_state
from ray.serve.global_state import GlobalState, ServeMaster
from ray.serve.kv_store_service import SQLiteKVStore
from ray.serve.task_runner import RayServeMixin, TaskRunnerActor
from ray.serve.utils import block_until_http_ready, get_random_letters, expand
from ray.serve.utils import block_until_http_ready, expand
from ray.serve.exceptions import RayServeException, batch_annotation_not_found
from ray.serve.backend_config import BackendConfig
from ray.serve.policy import RoutePolicy
@@ -138,16 +138,15 @@ def init(
def kv_store_connector(namespace):
return SQLiteKVStore(namespace, db_path=kv_store_path)
master = start_initial_state(kv_store_connector)
master = ServeMaster.options(
detached=True, name=SERVE_MASTER_NAME).remote(kv_store_connector)
ray.get(master.start_router.remote(queueing_policy.value, policy_kwargs))
global_state = GlobalState(master)
router = global_state.init_or_get_router(
queueing_policy=queueing_policy, policy_kwargs=policy_kwargs)
global_state.init_or_get_metric_monitor(
gc_window_seconds=gc_window_seconds)
ray.get(master.start_metric_monitor.remote(gc_window_seconds))
if start_server:
global_state.init_or_get_http_proxy(
host=http_host, port=http_port).set_router_handle.remote(router)
ray.get(master.start_http_proxy.remote(http_host, http_port))
if start_server and blocking:
block_until_http_ready("http://{}:{}/-/routes".format(
@@ -169,9 +168,11 @@ def create_endpoint(endpoint_name, route=None, methods=["GET"]):
methods = [m.upper() for m in methods]
global_state.route_table.register_service(
route, endpoint_name, methods=methods)
ray.get(global_state.init_or_get_http_proxy().set_route_table.remote(
global_state.route_table.list_service(
include_methods=True, include_headless=False)))
http_proxy = global_state.get_http_proxy()
ray.get(
http_proxy.set_route_table.remote(
global_state.route_table.list_service(
include_methods=True, include_headless=False)))
@_ensure_connected
@@ -198,8 +199,8 @@ def set_backend_config(backend_tag, backend_config):
# inform the router about change in configuration
# particularly for setting max_batch_size
ray.get(global_state.init_or_get_router().set_backend_config.remote(
backend_tag, backend_config_dict))
router = global_state.get_router()
ray.get(router.set_backend_config.remote(backend_tag, backend_config_dict))
# checking if replicas need to be restarted
# Replicas are restarted if there is any change in the backend config
@@ -281,7 +282,6 @@ def create_backend(func_or_class,
class CustomActor(RayServeMixin, func_or_class):
@wraps(func_or_class.__init__)
def __init__(self, *args, **kwargs):
init() # serve init
super().__init__(*args, **kwargs)
arg_list = actor_init_args
@@ -305,68 +305,11 @@ def create_backend(func_or_class,
# set the backend config inside the router
# particularly for max-batch-size
ray.get(global_state.init_or_get_router().set_backend_config.remote(
backend_tag, backend_config_dict))
router = global_state.get_router()
ray.get(router.set_backend_config.remote(backend_tag, backend_config_dict))
_scale(backend_tag, backend_config_dict["num_replicas"])
def _start_replica(backend_tag):
assert (backend_tag in global_state.backend_table.list_backends()
), "Backend {} is not registered.".format(backend_tag)
replica_tag = "{}#{}".format(backend_tag, get_random_letters(length=6))
# get the info which starts the replicas
creator = global_state.backend_table.get_backend_creator(backend_tag)
backend_config_dict = global_state.backend_table.get_info(backend_tag)
backend_config = BackendConfig(**backend_config_dict)
init_args = global_state.backend_table.get_init_args(backend_tag)
# get actor creation kwargs
actor_kwargs = backend_config.get_actor_creation_args(init_args)
# Create the runner in the master actor
[runner_handle] = ray.get(
global_state.master_actor_handle.start_actor_with_creator.remote(
creator, actor_kwargs, replica_tag))
# Setup the worker
ray.get(
runner_handle._ray_serve_setup.remote(
backend_tag, global_state.init_or_get_router(), runner_handle))
runner_handle._ray_serve_fetch.remote()
# Register the worker in config tables as well as metric monitor
global_state.backend_table.add_replica(backend_tag, replica_tag)
global_state.init_or_get_metric_monitor().add_target.remote(runner_handle)
def _remove_replica(backend_tag):
assert (backend_tag in global_state.backend_table.list_backends()
), "Backend {} is not registered.".format(backend_tag)
assert (
len(global_state.backend_table.list_replicas(backend_tag)) >
0), "Backend {} does not have enough replicas to be removed.".format(
backend_tag)
replica_tag = global_state.backend_table.remove_replica(backend_tag)
[replica_handle] = ray.get(
global_state.master_actor_handle.get_handle.remote(replica_tag))
# Remove the replica from metric monitor.
ray.get(global_state.init_or_get_metric_monitor().remove_target.remote(
replica_handle))
# Remove the replica from master actor.
ray.get(global_state.master_actor_handle.remove_handle.remote(replica_tag))
# Remove the replica from router.
# This will also destory the actor handle.
ray.get(
global_state.init_or_get_router().remove_and_destory_replica.remote(
backend_tag, replica_handle))
@_ensure_connected
def _scale(backend_tag, num_replicas):
"""Set the number of replicas for backend_tag.
@@ -386,10 +329,14 @@ def _scale(backend_tag, num_replicas):
if delta_num_replicas > 0:
for _ in range(delta_num_replicas):
_start_replica(backend_tag)
ray.get(
global_state.master_actor.start_backend_replica.remote(
backend_tag))
elif delta_num_replicas < 0:
for _ in range(-delta_num_replicas):
_remove_replica(backend_tag)
ray.get(
global_state.master_actor.remove_backend_replica.remote(
backend_tag))
@_ensure_connected
@@ -441,8 +388,9 @@ def split(endpoint_name, traffic_policy_dictionary):
global_state.policy_table.register_traffic_policy(
endpoint_name, traffic_policy_dictionary)
ray.get(global_state.init_or_get_router().set_traffic.remote(
endpoint_name, traffic_policy_dictionary))
router = global_state.get_router()
ray.get(
router.set_traffic.remote(endpoint_name, traffic_policy_dictionary))
@_ensure_connected
@@ -473,7 +421,7 @@ def get_handle(endpoint_name,
from ray.serve.handle import RayServeHandle
return RayServeHandle(
global_state.init_or_get_router(),
global_state.get_router(),
endpoint_name,
relative_slo_ms,
absolute_slo_ms,
@@ -492,8 +440,8 @@ def stat(percentiles=[50, 90, 95],
The longest aggregation window must be shorter or equal to the
gc_window_seconds.
"""
return ray.get(global_state.init_or_get_metric_monitor().collect.remote(
percentiles, agg_windows_seconds))
monitor = global_state.get_metric_monitor()
return ray.get(monitor.collect.remote(percentiles, agg_windows_seconds))
class route:
-3
View File
@@ -1,9 +1,6 @@
#: Actor name used to register master actor
SERVE_MASTER_NAME = "SERVE_MASTER_ACTOR"
#: KVStore connector key in bootstrap config
BOOTSTRAP_KV_STORE_CONN_KEY = "kv_store_connector"
#: HTTP Address
DEFAULT_HTTP_ADDRESS = "http://127.0.0.1:8000"
+110 -129
View File
@@ -1,23 +1,11 @@
import ray
from ray.serve.constants import (BOOTSTRAP_KV_STORE_CONN_KEY,
DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT,
SERVE_MASTER_NAME, ASYNC_CONCURRENCY)
from ray.serve.backend_config import BackendConfig
from ray.serve.constants import (SERVE_MASTER_NAME, ASYNC_CONCURRENCY)
from ray.serve.http_proxy import HTTPProxyActor
from ray.serve.kv_store_service import (BackendTable, RoutingTable,
TrafficPolicyTable)
from ray.serve.metric import (MetricMonitor, start_metric_monitor_loop)
from ray.serve.policy import RoutePolicy
from ray.serve.http_proxy import HTTPProxyActor
def start_initial_state(kv_store_connector):
master_handle = ServeMaster.remote()
ray.util.register_actor(SERVE_MASTER_NAME, master_handle)
ray.get(
master_handle.store_bootstrap_state.remote(BOOTSTRAP_KV_STORE_CONN_KEY,
kv_store_connector))
return master_handle
from ray.serve.utils import get_random_letters
@ray.remote
@@ -30,55 +18,107 @@ class ServeMaster:
we need to initialize and store actor handles in a seperate actor.
"""
def __init__(self):
def __init__(self, kv_store_connector):
self.kv_store_connector = kv_store_connector
self.route_table = RoutingTable(kv_store_connector)
self.backend_table = BackendTable(kv_store_connector)
self.policy_table = TrafficPolicyTable(kv_store_connector)
self.tag_to_actor_handles = dict()
self.bootstrap_state = dict()
self.router = None
self.http_proxy = None
self.metric_monitor = None
def start_actor(self,
actor_cls,
tag,
init_args=(),
init_kwargs={},
is_asyncio=False):
"""Start an actor and add it to the nursery"""
# Avoid double initialization
if tag in self.tag_to_actor_handles.keys():
return [self.tag_to_actor_handles[tag]]
def get_kv_store_connector(self):
return self.kv_store_connector
max_concurrency = ASYNC_CONCURRENCY if is_asyncio else None
handle = (actor_cls.options(max_concurrency=max_concurrency).remote(
*init_args, **init_kwargs))
self.tag_to_actor_handles[tag] = handle
return [handle]
def start_router(self, router_class, init_kwargs):
assert self.router is None, "Router already started."
self.router = router_class.options(
max_concurrency=ASYNC_CONCURRENCY).remote(**init_kwargs)
def start_actor_with_creator(self, creator, kwargs, tag):
"""
Args:
creator (Callable[Dict]): a closure that should return
a newly created actor handle when called with kwargs.
The kwargs input is passed to `ActorCls_remote` method.
"""
handle = creator(kwargs)
self.tag_to_actor_handles[tag] = handle
return [handle]
def get_router(self):
assert self.router is not None, "Router not started yet."
return [self.router]
def start_http_proxy(self, host, port):
assert self.http_proxy is None, "HTTP proxy already started."
assert self.router is not None, (
"Router must be started before HTTP proxy.")
self.http_proxy = HTTPProxyActor.options(
max_concurrency=ASYNC_CONCURRENCY).remote()
self.http_proxy.run.remote(host, port)
ray.get(self.http_proxy.set_router_handle.remote(self.router))
def get_http_proxy(self):
assert self.http_proxy is not None, "HTTP proxy not started yet."
return [self.http_proxy]
def start_metric_monitor(self, gc_window_seconds):
assert self.metric_monitor is None, "Metric monitor already started."
self.metric_monitor = MetricMonitor.remote(gc_window_seconds)
# TODO(edoakes): this should be an actor method, not a separate task.
start_metric_monitor_loop.remote(self.metric_monitor)
self.metric_monitor.add_target.remote(self.router)
def get_metric_monitor(self):
assert self.metric_monitor is not None, (
"Metric monitor not started yet.")
return [self.metric_monitor]
def start_backend_replica(self, backend_tag):
assert (backend_tag in self.backend_table.list_backends()
), "Backend {} is not registered.".format(backend_tag)
replica_tag = "{}#{}".format(backend_tag, get_random_letters(length=6))
# Fetch the info to start the replica from the backend table.
creator = self.backend_table.get_backend_creator(backend_tag)
backend_config_dict = self.backend_table.get_info(backend_tag)
backend_config = BackendConfig(**backend_config_dict)
init_args = self.backend_table.get_init_args(backend_tag)
kwargs = backend_config.get_actor_creation_args(init_args)
runner_handle = creator(kwargs)
self.tag_to_actor_handles[replica_tag] = runner_handle
# Set up the worker.
ray.get(
runner_handle._ray_serve_setup.remote(backend_tag,
self.get_router()[0],
runner_handle))
ray.get(runner_handle._ray_serve_fetch.remote())
# Register the worker in config tables and metric monitor.
self.backend_table.add_replica(backend_tag, replica_tag)
self.get_metric_monitor()[0].add_target.remote(runner_handle)
def remove_backend_replica(self, backend_tag):
assert (backend_tag in self.backend_table.list_backends()
), "Backend {} is not registered.".format(backend_tag)
assert (
len(self.backend_table.list_replicas(backend_tag)) > 0
), "Backend {} does not have enough replicas to be removed.".format(
backend_tag)
replica_tag = self.backend_table.remove_replica(backend_tag)
assert replica_tag in self.tag_to_actor_handles
replica_handle = self.tag_to_actor_handles.pop(replica_tag)
# Remove the replica from metric monitor.
[monitor] = self.get_metric_monitor()
ray.get(monitor.remove_target.remote(replica_handle))
# Remove the replica from router.
# This will also destroy the actor handle.
[router] = self.get_router()
ray.get(
router.remove_and_destory_replica.remote(backend_tag,
replica_handle))
def get_all_handles(self):
return self.tag_to_actor_handles
def get_handle(self, actor_tag):
return [self.tag_to_actor_handles[actor_tag]]
def remove_handle(self, actor_tag):
if actor_tag in self.tag_to_actor_handles.keys():
self.tag_to_actor_handles.pop(actor_tag)
def store_bootstrap_state(self, key, value):
self.bootstrap_state[key] = value
def get_bootstrap_state_dict(self):
return self.bootstrap_state
class GlobalState:
"""Encapsulate all global state in the serving system.
@@ -88,83 +128,24 @@ class GlobalState:
2. A actor supervisor service
"""
def __init__(self, master_actor_handle=None):
# Get actor nursery handle
if master_actor_handle is None:
master_actor_handle = ray.util.get_actor(SERVE_MASTER_NAME)
self.master_actor_handle = master_actor_handle
def __init__(self, master_actor=None):
# Get actor nursery handle.
if master_actor is None:
master_actor = ray.util.get_actor(SERVE_MASTER_NAME)
self.master_actor = master_actor
# Connect to all the table
bootstrap_config = ray.get(
self.master_actor_handle.get_bootstrap_state_dict.remote())
kv_store_connector = bootstrap_config[BOOTSTRAP_KV_STORE_CONN_KEY]
# Connect to all the tables.
kv_store_connector = ray.get(
self.master_actor.get_kv_store_connector.remote())
self.route_table = RoutingTable(kv_store_connector)
self.backend_table = BackendTable(kv_store_connector)
self.policy_table = TrafficPolicyTable(kv_store_connector)
self.refresh_actor_handle_cache()
def get_http_proxy(self):
return ray.get(self.master_actor.get_http_proxy.remote())[0]
def refresh_actor_handle_cache(self):
self.actor_handle_cache = ray.get(
self.master_actor_handle.get_all_handles.remote())
def get_router(self):
return ray.get(self.master_actor.get_router.remote())[0]
def init_or_get_http_proxy(self,
host=DEFAULT_HTTP_HOST,
port=DEFAULT_HTTP_PORT):
if "http_proxy" not in self.actor_handle_cache:
[handle] = ray.get(
self.master_actor_handle.start_actor.remote(
HTTPProxyActor, tag="http_proxy"))
handle.run.remote(host=host, port=port)
self.refresh_actor_handle_cache()
return self.actor_handle_cache["http_proxy"]
def _get_queueing_policy(self, default_policy):
return_policy = default_policy
# check if there is already a queue_actor running
# with policy as p.name for the case where
# serve nursery exists: ray.util.get_actor(SERVE_MASTER_NAME)
for p in RoutePolicy:
queue_actor_tag = "queue_actor::" + p.name
if queue_actor_tag in self.actor_handle_cache:
return_policy = p
break
return return_policy
def init_or_get_router(self,
queueing_policy=RoutePolicy.Random,
policy_kwargs={}):
# get queueing policy
self.queueing_policy = self._get_queueing_policy(
default_policy=queueing_policy)
queue_actor_tag = "queue_actor::" + self.queueing_policy.name
if queue_actor_tag not in self.actor_handle_cache:
[handle] = ray.get(
self.master_actor_handle.start_actor.remote(
self.queueing_policy.value,
init_kwargs=policy_kwargs,
tag=queue_actor_tag,
is_asyncio=True))
# handle.register_self_handle.remote(handle)
self.refresh_actor_handle_cache()
return self.actor_handle_cache[queue_actor_tag]
def init_or_get_metric_monitor(self, gc_window_seconds=3600):
if "metric_monitor" not in self.actor_handle_cache:
[handle] = ray.get(
self.master_actor_handle.start_actor.remote(
MetricMonitor,
init_args=(gc_window_seconds, ),
tag="metric_monitor"))
start_metric_monitor_loop.remote(handle)
if "queue_actor" in self.actor_handle_cache:
handle.add_target.remote(
self.actor_handle_cache["queue_actor"])
self.refresh_actor_handle_cache()
return self.actor_handle_cache["metric_monitor"]
def get_metric_monitor(self):
return ray.get(self.master_actor.get_metric_monitor.remote())[0]
+1 -1
View File
@@ -155,7 +155,7 @@ class HTTPProxyActor:
def __init__(self):
self.app = HTTPProxy()
async def run(self, host="0.0.0.0", port=8000):
async def run(self, host, port):
sock = socket.socket()
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind((host, port))
+4 -4
View File
@@ -202,8 +202,8 @@ def test_killing_replicas(serve_instance):
serve.set_backend_config("simple:v1", bnew_config)
new_replica_tag_list = global_state.backend_table.list_replicas(
"simple:v1")
global_state.refresh_actor_handle_cache()
new_all_tag_list = list(global_state.actor_handle_cache.keys())
new_all_tag_list = list(
ray.get(global_state.master_actor.get_all_handles.remote()).keys())
# the new_replica_tag_list must be subset of all_tag_list
assert set(new_replica_tag_list) <= set(new_all_tag_list)
@@ -236,8 +236,8 @@ def test_not_killing_replicas(serve_instance):
serve.set_backend_config("bsimple:v1", bnew_config)
new_replica_tag_list = global_state.backend_table.list_replicas(
"bsimple:v1")
global_state.refresh_actor_handle_cache()
new_all_tag_list = list(global_state.actor_handle_cache.keys())
new_all_tag_list = list(
ray.get(global_state.master_actor.get_all_handles.remote()).keys())
# the old and new replica tag list should be identical
# and should be subset of all_tag_list