Files
ray/python/ray/serve/global_state.py
T

152 lines
6.1 KiB
Python

import ray
from ray.serve.backend_config import BackendConfig
from ray.serve.constants import (SERVE_MASTER_NAME, ASYNC_CONCURRENCY)
from ray.serve.http_proxy import HTTPProxyActor
from ray.serve.kv_store_service import (BackendTable, RoutingTable,
TrafficPolicyTable)
from ray.serve.metric import (MetricMonitor, start_metric_monitor_loop)
from ray.serve.utils import get_random_letters
@ray.remote
class ServeMaster:
"""Initialize and store all actor handles.
Note:
This actor is necessary because ray will destory actors when the
original actor handle goes out of scope (when driver exit). Therefore
we need to initialize and store actor handles in a seperate actor.
"""
def __init__(self, kv_store_connector):
self.kv_store_connector = kv_store_connector
self.route_table = RoutingTable(kv_store_connector)
self.backend_table = BackendTable(kv_store_connector)
self.policy_table = TrafficPolicyTable(kv_store_connector)
self.tag_to_actor_handles = dict()
self.router = None
self.http_proxy = None
self.metric_monitor = None
def get_kv_store_connector(self):
return self.kv_store_connector
def start_router(self, router_class, init_kwargs):
assert self.router is None, "Router already started."
self.router = router_class.options(
max_concurrency=ASYNC_CONCURRENCY).remote(**init_kwargs)
def get_router(self):
assert self.router is not None, "Router not started yet."
return [self.router]
def start_http_proxy(self, host, port):
assert self.http_proxy is None, "HTTP proxy already started."
assert self.router is not None, (
"Router must be started before HTTP proxy.")
self.http_proxy = HTTPProxyActor.options(
max_concurrency=ASYNC_CONCURRENCY).remote()
self.http_proxy.run.remote(host, port)
ray.get(self.http_proxy.set_router_handle.remote(self.router))
def get_http_proxy(self):
assert self.http_proxy is not None, "HTTP proxy not started yet."
return [self.http_proxy]
def start_metric_monitor(self, gc_window_seconds):
assert self.metric_monitor is None, "Metric monitor already started."
self.metric_monitor = MetricMonitor.remote(gc_window_seconds)
# TODO(edoakes): this should be an actor method, not a separate task.
start_metric_monitor_loop.remote(self.metric_monitor)
self.metric_monitor.add_target.remote(self.router)
def get_metric_monitor(self):
assert self.metric_monitor is not None, (
"Metric monitor not started yet.")
return [self.metric_monitor]
def start_backend_replica(self, backend_tag):
assert (backend_tag in self.backend_table.list_backends()
), "Backend {} is not registered.".format(backend_tag)
replica_tag = "{}#{}".format(backend_tag, get_random_letters(length=6))
# Fetch the info to start the replica from the backend table.
creator = self.backend_table.get_backend_creator(backend_tag)
backend_config_dict = self.backend_table.get_info(backend_tag)
backend_config = BackendConfig(**backend_config_dict)
init_args = self.backend_table.get_init_args(backend_tag)
kwargs = backend_config.get_actor_creation_args(init_args)
runner_handle = creator(kwargs)
self.tag_to_actor_handles[replica_tag] = runner_handle
# Set up the worker.
ray.get(
runner_handle._ray_serve_setup.remote(backend_tag,
self.get_router()[0],
runner_handle))
ray.get(runner_handle._ray_serve_fetch.remote())
# Register the worker in config tables and metric monitor.
self.backend_table.add_replica(backend_tag, replica_tag)
self.get_metric_monitor()[0].add_target.remote(runner_handle)
def remove_backend_replica(self, backend_tag):
assert (backend_tag in self.backend_table.list_backends()
), "Backend {} is not registered.".format(backend_tag)
assert (
len(self.backend_table.list_replicas(backend_tag)) > 0
), "Backend {} does not have enough replicas to be removed.".format(
backend_tag)
replica_tag = self.backend_table.remove_replica(backend_tag)
assert replica_tag in self.tag_to_actor_handles
replica_handle = self.tag_to_actor_handles.pop(replica_tag)
# Remove the replica from metric monitor.
[monitor] = self.get_metric_monitor()
ray.get(monitor.remove_target.remote(replica_handle))
# Remove the replica from router.
# This will also destroy the actor handle.
[router] = self.get_router()
ray.get(
router.remove_and_destory_replica.remote(backend_tag,
replica_handle))
def get_all_handles(self):
return self.tag_to_actor_handles
class GlobalState:
"""Encapsulate all global state in the serving system.
The information is fetch lazily from
1. A collection of namespaced key value stores
2. A actor supervisor service
"""
def __init__(self, master_actor=None):
# Get actor nursery handle.
if master_actor is None:
master_actor = ray.util.get_actor(SERVE_MASTER_NAME)
self.master_actor = master_actor
# Connect to all the tables.
kv_store_connector = ray.get(
self.master_actor.get_kv_store_connector.remote())
self.route_table = RoutingTable(kv_store_connector)
self.backend_table = BackendTable(kv_store_connector)
self.policy_table = TrafficPolicyTable(kv_store_connector)
def get_http_proxy(self):
return ray.get(self.master_actor.get_http_proxy.remote())[0]
def get_router(self):
return ray.get(self.master_actor.get_router.remote())[0]
def get_metric_monitor(self):
return ray.get(self.master_actor.get_metric_monitor.remote())[0]