mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 05:43:03 +08:00
227 lines
9.5 KiB
Python
227 lines
9.5 KiB
Python
import inspect
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import pydantic
|
|
from pydantic import BaseModel, PositiveFloat, PositiveInt, validator
|
|
from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST,
|
|
DEFAULT_HTTP_PORT)
|
|
|
|
|
|
def _callable_accepts_batch(func_or_class):
|
|
if inspect.isfunction(func_or_class):
|
|
return hasattr(func_or_class, "_serve_accept_batch")
|
|
elif inspect.isclass(func_or_class):
|
|
return hasattr(func_or_class.__call__, "_serve_accept_batch")
|
|
|
|
|
|
def _callable_is_blocking(func_or_class):
|
|
if inspect.isfunction(func_or_class):
|
|
return not inspect.iscoroutinefunction(func_or_class)
|
|
elif inspect.isclass(func_or_class):
|
|
return not inspect.iscoroutinefunction(func_or_class.__call__)
|
|
|
|
|
|
@dataclass
class BackendMetadata:
    """Facts about a backend that are stored alongside its configuration.

    BackendConfig keeps one of these in its ``internal_metadata`` field and
    reads it when validating batch settings and when choosing a default for
    ``max_concurrent_queries``.
    """

    # Whether the backend callable is marked as taking a batch of requests.
    accepts_batches: bool = False

    # Whether the backend callable is synchronous rather than a coroutine.
    is_blocking: bool = True

    # NOTE(review): not read anywhere in this module; presumably consumed by
    # the autoscaler elsewhere -- confirm the expected keys with its caller.
    autoscaling_config: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
class BackendConfig(BaseModel):
    """Configuration options for a backend, to be set by the user.

    Args:
        num_replicas (Optional[int]): The number of processes to start up that
            will handle requests to this backend. Defaults to 1.
        max_batch_size (Optional[int]): The maximum number of requests that
            will be processed in one batch by this backend. Defaults to None
            (no maximum).
        batch_wait_timeout (Optional[float]): The time in seconds that backend
            replicas will wait for a full batch of requests before processing a
            partial batch. Defaults to 0.
        max_concurrent_queries (Optional[int]): The maximum number of queries
            that will be sent to a replica of this backend without receiving a
            response. When left as None, a value is derived from the other
            settings by ``set_max_queries_by_mode``.
        user_config (Optional[Any]): Arguments to pass to the reconfigure
            method of the backend. The reconfigure method is called if
            user_config is not None.
        experimental_graceful_shutdown_wait_loop_s (Optional[float]): Duration
            that backend workers will wait until there is no more work to be
            done before shutting down. Defaults to 2s.
        experimental_graceful_shutdown_timeout_s (Optional[float]):
            Controller waits for this duration to forcefully kill the replica
            for shutdown. Defaults to 20s.
    """

    # NOTE: field order matters. The max_concurrent_queries validator reads
    # the fields declared above it through pydantic's `values` mapping.
    internal_metadata: BackendMetadata = BackendMetadata()
    num_replicas: PositiveInt = 1
    max_batch_size: Optional[PositiveInt] = None
    batch_wait_timeout: float = 0
    max_concurrent_queries: Optional[int] = None
    user_config: Any = None

    experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0
    experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0

    class Config:
        validate_assignment = True
        extra = "forbid"
        arbitrary_types_allowed = True

    def _validate_batch_size(self):
        """Reject a batch size above 1 for a backend that is not batched.

        Raises:
            ValueError: If max_batch_size is greater than 1 while
                internal_metadata.accepts_batches is false.
        """
        if (self.max_batch_size is not None
                and not self.internal_metadata.accepts_batches
                and self.max_batch_size > 1):
            raise ValueError(
                "max_batch_size is set in config but the function or "
                "method does not accept batching. Please use "
                "@serve.accept_batch to explicitly mark that the function or "
                "method accepts a list of requests as an argument.")

    # This is not a pydantic validator, so that we may skip this method when
    # creating partially filled BackendConfig objects to pass as updates--for
    # example, BackendConfig(max_batch_size=5).
    def _validate_complete(self):
        """Run the checks that only make sense on a fully populated config."""
        self._validate_batch_size()

    # Dynamic default for max_concurrent_queries
    @validator("max_concurrent_queries", always=True)
    def set_max_queries_by_mode(cls, v, values):  # noqa 805
        """Choose max_concurrent_queries when the user did not set one.

        Args:
            v: The supplied value, or None when it was left unset.
            values: The fields declared before this one that have already
                passed validation.

        Returns:
            ``v`` unchanged when it is not None; otherwise the value derived
            from the blocking/batching settings, which may still be None if
            none of the modes below applies.
        """
        if v is not None:
            return v

        # pydantic leaves a field out of `values` when that field failed its
        # own validation. In that case it will report the original error, so
        # avoid masking it with a KeyError raised from here.
        metadata = values.get("internal_metadata")
        batch_wait_timeout = values.get("batch_wait_timeout")
        if metadata is None or batch_wait_timeout is None:
            return v
        max_batch_size = values.get("max_batch_size")

        # Model serving mode: if the servable is blocking and the wait
        # timeout is default zero seconds, then we keep the existing
        # behavior to allow at most max batch size queries.
        if metadata.is_blocking and batch_wait_timeout == 0:
            if max_batch_size is not None:
                v = 2 * max_batch_size
            else:
                v = 8

        # Pipeline/async mode: if the servable is not blocking,
        # router should just keep pushing queries to the replicas
        # until a high limit.
        if not metadata.is_blocking:
            v = ASYNC_CONCURRENCY

        # Batch inference mode: user specifies non zero timeout to wait for
        # full batch. We will use 2*max_batch_size to perform double
        # buffering to keep the replica busy.
        if max_batch_size is not None and batch_wait_timeout > 0:
            v = 2 * max_batch_size
        return v
|
|
|
|
|
|
class ReplicaConfig:
    """A backend definition together with its init args and actor options.

    Attributes:
        func_or_class: The backend callable (a function or a class).
        accepts_batches: Result of _callable_accepts_batch(func_or_class).
        is_blocking: Result of _callable_is_blocking(func_or_class).
        actor_init_args: Positional init arguments, stored as a list.
        ray_actor_options: Validated options dict; always has "num_cpus".
        resource_dict: Resource amounts derived from ray_actor_options, keyed
            by "CPU", "GPU", "memory", "object_store_memory" plus any custom
            entries from the "resources" option.
    """

    def __init__(self, func_or_class, *actor_init_args,
                 ray_actor_options=None):
        """Store the definition and validate it.

        Args:
            func_or_class: The backend function or class.
            *actor_init_args: Init arguments; only allowed for a class.
            ray_actor_options: Optional dict of Ray actor options.

        Raises:
            TypeError: If an argument or option has the wrong type.
            ValueError: If an option is reserved or out of range, or init
                args are given for a function backend.
        """
        self.func_or_class = func_or_class
        self.accepts_batches = _callable_accepts_batch(func_or_class)
        self.is_blocking = _callable_is_blocking(func_or_class)
        self.actor_init_args = list(actor_init_args)
        if ray_actor_options is None:
            self.ray_actor_options = {}
        elif isinstance(ray_actor_options, dict):
            # Shallow copy: _validate() writes a default "num_cpus" into the
            # options, which must not leak into the caller's dictionary.
            self.ray_actor_options = dict(ray_actor_options)
        else:
            # Kept untouched so that _validate() raises the TypeError.
            self.ray_actor_options = ray_actor_options

        self.resource_dict = {}
        self._validate()

    def _validated_amount(self, option, default):
        """Return ray_actor_options[option], checked to be a number >= 0."""
        amount = self.ray_actor_options.get(option, default)
        if not isinstance(amount, (int, float)):
            raise TypeError(
                "{} in ray_actor_options must be an int or a float.".format(
                    option))
        if amount < 0:
            raise ValueError(
                "{} in ray_actor_options must be >= 0.".format(option))
        return amount

    def _validate(self):
        """Check the stored definition and options; fill resource_dict.

        Raises:
            TypeError: If func_or_class is neither a function nor a class,
                ray_actor_options or its "resources" entry is not a dict, or
                a resource amount is not an int or a float.
            ValueError: If init args are given for a function backend, a
                reserved option is present, or a resource amount is negative.
        """
        # Validate that func_or_class is a function or class.
        if inspect.isfunction(self.func_or_class):
            if len(self.actor_init_args) != 0:
                raise ValueError(
                    "actor_init_args not supported for function backend.")
        elif not inspect.isclass(self.func_or_class):
            raise TypeError(
                "Backend must be a function or class, it is {}.".format(
                    type(self.func_or_class)))

        if not isinstance(self.ray_actor_options, dict):
            raise TypeError("ray_actor_options must be a dictionary.")

        # These options may not be chosen by the user. The message names
        # ray_actor_options because that is the argument being inspected.
        for reserved in ("lifetime", "name", "max_restarts"):
            if reserved in self.ray_actor_options:
                raise ValueError(
                    "Specifying {} in ray_actor_options is not allowed."
                    .format(reserved))

        # Ray defaults to zero CPUs for placement, we default to one here.
        self.ray_actor_options.setdefault("num_cpus", 1)
        self.resource_dict["CPU"] = self._validated_amount("num_cpus", 1)
        self.resource_dict["GPU"] = self._validated_amount("num_gpus", 0)
        self.resource_dict["memory"] = self._validated_amount("memory", 0)
        self.resource_dict["object_store_memory"] = self._validated_amount(
            "object_store_memory", 0)

        custom_resources = self.ray_actor_options.get("resources", {})
        if not isinstance(custom_resources, dict):
            raise TypeError(
                "resources in ray_actor_options must be a dictionary.")
        self.resource_dict.update(custom_resources)
|
|
|
|
|
|
class DeploymentMode(str, Enum):
    """Where HTTP servers are placed; used as HTTPOptions.location.

    Each member is also a ``str`` whose value equals its own name.
    NOTE(review): the HeadOnly / EveryNode placement semantics are implied
    by the names only -- confirm against the code that consumes this enum.
    """

    # HTTPOptions falls back to this when the host or the location is None.
    NoServer = "NoServer"

    # Default value of HTTPOptions.location.
    HeadOnly = "HeadOnly"

    EveryNode = "EveryNode"
|
|
|
|
|
|
class HTTPOptions(pydantic.BaseModel):
    """Validated HTTP settings: host, port, middlewares and placement."""

    # Documentation inside serve.start for user's convenience.
    host: Optional[str] = DEFAULT_HTTP_HOST
    port: int = DEFAULT_HTTP_PORT
    middlewares: List[Any] = []
    location: Optional[DeploymentMode] = DeploymentMode.HeadOnly

    @validator("location", always=True)
    def location_backfill_no_server(cls, v, values):  # noqa 805
        """Force the location to NoServer when no server can be started.

        Args:
            v: The supplied location, possibly None.
            values: The fields declared before ``location`` that have already
                passed validation.

        Returns:
            DeploymentMode.NoServer when either the location or the host is
            None; otherwise ``v`` unchanged.
        """
        if v is None:
            return DeploymentMode.NoServer
        # `host` is missing from `values` when its own validation failed.
        # pydantic reports that error itself, so do not raise KeyError here.
        if "host" in values and values["host"] is None:
            return DeploymentMode.NoServer
        return v

    class Config:
        validate_assignment = True
        extra = "forbid"
        arbitrary_types_allowed = True
|