Mirror of https://github.com/wassname/ray.git (synced 2026-07-03 13:11:03 +08:00)
[Metric] custom metrics refinement (#10861)
* In progress
* In Progress.
* Addressed code review.
* Add unit tests.
* Add a simple doc.
* Fixed test failure.
* Fix all test failures from serve.
* Addressed code review.
This commit is contained in:
@@ -12,7 +12,7 @@ from ray.async_compat import sync_to_async
|
||||
from ray.serve.utils import (parse_request_item, _get_logger, chain_future,
|
||||
unpack_future)
|
||||
from ray.serve.exceptions import RayServeException
|
||||
from ray.experimental import metrics
|
||||
from ray.util import metrics
|
||||
from ray.serve.config import BackendConfig
|
||||
from ray.serve.router import Query
|
||||
from ray.serve.constants import DEFAULT_LATENCY_BUCKET_MS
|
||||
@@ -159,43 +159,72 @@ class RayServeWorker:
|
||||
self.num_ongoing_requests = 0
|
||||
|
||||
self.request_counter = metrics.Count(
|
||||
"backend_request_counter", ("Number of queries that have been "
|
||||
"processed in this replica"),
|
||||
"requests", ["backend"])
|
||||
self.error_counter = metrics.Count("backend_error_counter",
|
||||
("Number of exceptions that have "
|
||||
"occurred in the backend"),
|
||||
"errors", ["backend"])
|
||||
"backend_request_counter",
|
||||
description=("Number of queries that have been "
|
||||
"processed in this replica"),
|
||||
tag_keys=("backend", ))
|
||||
self.request_counter.set_default_tags({"backend": self.backend_tag})
|
||||
|
||||
self.error_counter = metrics.Count(
|
||||
"backend_error_counter",
|
||||
description=("Number of exceptions that have "
|
||||
"occurred in the backend"),
|
||||
tag_keys=("backend", ))
|
||||
self.error_counter.set_default_tags({"backend": self.backend_tag})
|
||||
|
||||
self.restart_counter = metrics.Count(
|
||||
"backend_worker_starts",
|
||||
("The number of time this replica workers "
|
||||
"has been restarted due to failure."), "restarts",
|
||||
["backend", "replica_tag"])
|
||||
|
||||
self.queuing_latency_tracker = metrics.Histogram(
|
||||
"backend_queuing_latency_ms",
|
||||
("The latency for queries waiting in the replica's queue "
|
||||
"waiting to be processed or batched."), "ms",
|
||||
DEFAULT_LATENCY_BUCKET_MS, ["backend", "replica_tag"])
|
||||
self.processing_latency_tracker = metrics.Histogram(
|
||||
"backend_processing_latency_ms",
|
||||
"The latency for queries to be processed", "ms",
|
||||
DEFAULT_LATENCY_BUCKET_MS,
|
||||
["backend", "replica_tag", "batch_size"])
|
||||
self.num_queued_items = metrics.Gauge(
|
||||
"replica_queued_queries",
|
||||
"Current number of queries queued in the the backend replicas",
|
||||
"requests", ["backend", "replica_tag"])
|
||||
self.num_processing_items = metrics.Gauge(
|
||||
"replica_processing_queries",
|
||||
"Current number of queries being processed", "requests",
|
||||
["backend", "replica_tag"])
|
||||
|
||||
self.restart_counter.record(1, {
|
||||
description=("The number of time this replica workers "
|
||||
"has been restarted due to failure."),
|
||||
tag_keys=("backend", "replica_tag"))
|
||||
self.restart_counter.set_default_tags({
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
|
||||
self.queuing_latency_tracker = metrics.Histogram(
|
||||
"backend_queuing_latency_ms",
|
||||
description=(
|
||||
"The latency for queries waiting in the replica's queue "
|
||||
"waiting to be processed or batched."),
|
||||
boundaries=DEFAULT_LATENCY_BUCKET_MS,
|
||||
tag_keys=("backend", "replica_tag"))
|
||||
self.queuing_latency_tracker.set_default_tags({
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
|
||||
self.processing_latency_tracker = metrics.Histogram(
|
||||
"backend_processing_latency_ms",
|
||||
description="The latency for queries to be processed",
|
||||
boundaries=DEFAULT_LATENCY_BUCKET_MS,
|
||||
tag_keys=("backend", "replica_tag", "batch_size"))
|
||||
self.processing_latency_tracker.set_default_tags({
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
|
||||
self.num_queued_items = metrics.Gauge(
|
||||
"replica_queued_queries",
|
||||
description=("Current number of queries queued in the "
|
||||
"the backend replicas"),
|
||||
tag_keys=("backend", "replica_tag"))
|
||||
self.num_queued_items.set_default_tags({
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
|
||||
self.num_processing_items = metrics.Gauge(
|
||||
"replica_processing_queries",
|
||||
description="Current number of queries being processed",
|
||||
tag_keys=("backend", "replica_tag"))
|
||||
self.num_processing_items.set_default_tags({
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
|
||||
self.restart_counter.record(1)
|
||||
|
||||
asyncio.get_event_loop().create_task(self.main_loop())
|
||||
|
||||
def get_runner_method(self, request_item: Query) -> Callable:
|
||||
@@ -216,17 +245,13 @@ class RayServeWorker:
|
||||
start = time.time()
|
||||
try:
|
||||
result = await method_to_call(arg)
|
||||
self.request_counter.record(1, {"backend": self.backend_tag})
|
||||
self.request_counter.record(1)
|
||||
except Exception as e:
|
||||
result = wrap_to_ray_error(e)
|
||||
self.error_counter.record(1, {"backend": self.backend_tag})
|
||||
self.error_counter.record(1)
|
||||
|
||||
self.processing_latency_tracker.record(
|
||||
(time.time() - start) * 1000, {
|
||||
"backend": self.backend_tag,
|
||||
"replica": self.replica_tag,
|
||||
"batch_size": "1"
|
||||
})
|
||||
(time.time() - start) * 1000, tags={"batch_size": "1"})
|
||||
|
||||
return result
|
||||
|
||||
@@ -248,8 +273,7 @@ class RayServeWorker:
|
||||
"Please only send the same type of requests in batching "
|
||||
"mode.")
|
||||
|
||||
self.request_counter.record(batch_size,
|
||||
{"backend": self.backend_tag})
|
||||
self.request_counter.record(batch_size)
|
||||
|
||||
call_method = ensure_async(call_methods.pop())
|
||||
result_list = await call_method(args)
|
||||
@@ -274,15 +298,12 @@ class RayServeWorker:
|
||||
raise RayServeException(error_message)
|
||||
except Exception as e:
|
||||
wrapped_exception = wrap_to_ray_error(e)
|
||||
self.error_counter.record(1, {"backend": self.backend_tag})
|
||||
self.error_counter.record(1)
|
||||
result_list = [wrapped_exception for _ in range(batch_size)]
|
||||
|
||||
self.processing_latency_tracker.record(
|
||||
(time.time() - timing_start) * 1000, {
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag,
|
||||
"batch_size": str(batch_size)
|
||||
})
|
||||
(time.time() - timing_start) * 1000,
|
||||
tags={"batch_size": str(batch_size)})
|
||||
|
||||
return result_list
|
||||
|
||||
@@ -294,21 +315,12 @@ class RayServeWorker:
|
||||
batch = await self.batch_queue.wait_for_batch()
|
||||
|
||||
# Record metrics
|
||||
self.num_queued_items.record(self.batch_queue.qsize(), {
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
self.num_processing_items.record(
|
||||
self.num_ongoing_requests - self.batch_queue.qsize(), {
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
self.num_queued_items.record(self.batch_queue.qsize())
|
||||
self.num_processing_items.record(self.num_ongoing_requests -
|
||||
self.batch_queue.qsize())
|
||||
for query in batch:
|
||||
queuing_time = (time.time() - query.tick_enter_replica) * 1000
|
||||
self.queuing_latency_tracker.record(queuing_time, {
|
||||
"backend": self.backend_tag,
|
||||
"replica_tag": self.replica_tag
|
||||
})
|
||||
self.queuing_latency_tracker.record(queuing_time)
|
||||
|
||||
all_evaluated_futures = []
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ import uvicorn
|
||||
import ray
|
||||
from ray.exceptions import RayTaskError
|
||||
from ray.serve.context import TaskContext
|
||||
from ray.experimental import metrics
|
||||
from ray.util import metrics
|
||||
from ray.serve.http_util import Response
|
||||
from ray.serve.router import Router, RequestMetadata
|
||||
|
||||
@@ -32,8 +32,9 @@ class HTTPProxy:
|
||||
self.route_table = await controller.get_router_config.remote()
|
||||
|
||||
self.request_counter = metrics.Count(
|
||||
"num_http_requests", "The number of HTTP requests processed",
|
||||
"requests", ["route"])
|
||||
"num_http_requests",
|
||||
description="The number of HTTP requests processed",
|
||||
tag_keys=("route", ))
|
||||
|
||||
self.router = Router()
|
||||
await self.router.setup(name, controller_name)
|
||||
@@ -80,7 +81,7 @@ class HTTPProxy:
|
||||
assert scope["type"] == "http"
|
||||
current_path = scope["path"]
|
||||
|
||||
self.request_counter.record(1, {"route": current_path})
|
||||
self.request_counter.record(1, tags={"route": current_path})
|
||||
|
||||
if current_path.startswith("/-/"):
|
||||
await self._handle_system_request(scope, receive, send)
|
||||
|
||||
+20
-13
@@ -9,7 +9,7 @@ from dataclasses import dataclass, field
|
||||
from ray.exceptions import RayTaskError
|
||||
|
||||
import ray
|
||||
from ray.experimental import metrics
|
||||
from ray.util import metrics
|
||||
from ray.serve.context import TaskContext
|
||||
from ray.serve.endpoint_policy import RandomEndpointPolicy
|
||||
from ray.serve.utils import logger, chain_future
|
||||
@@ -139,21 +139,25 @@ class Router:
|
||||
# -- Metrics Registration -- #
|
||||
self.num_router_requests = metrics.Count(
|
||||
"num_router_requests",
|
||||
"Number of requests processed by the router.", "requests",
|
||||
["endpoint"])
|
||||
description="Number of requests processed by the router.",
|
||||
tag_keys=("endpoint", ))
|
||||
self.num_error_endpoint_requests = metrics.Count(
|
||||
"num_error_endpoint_requests",
|
||||
("Number of requests that errored when getting results "
|
||||
"for the endpoint."), "requests", ["endpoint"])
|
||||
description=(
|
||||
"Number of requests that errored when getting results "
|
||||
"for the endpoint."),
|
||||
tag_keys=("endpoint", ))
|
||||
self.num_error_backend_requests = metrics.Count(
|
||||
"num_error_backend_requests",
|
||||
("Number of requests that errored when getting result "
|
||||
"from the backend."), "requests", ["backend"])
|
||||
description=("Number of requests that errored when getting result "
|
||||
"from the backend."),
|
||||
tag_keys=("backend", ))
|
||||
|
||||
self.backend_queue_size = metrics.Gauge(
|
||||
"backend_queued_queries",
|
||||
"Current number of queries queued in the router for a backend",
|
||||
"requests", ["backend"])
|
||||
description=("Current number of queries queued "
|
||||
"in the router for a backend"),
|
||||
tag_keys=("backend", ))
|
||||
|
||||
asyncio.get_event_loop().create_task(self.report_queue_lengths())
|
||||
|
||||
@@ -161,7 +165,7 @@ class Router:
|
||||
**request_kwargs):
|
||||
endpoint = request_meta.endpoint
|
||||
logger.debug("Received a request for endpoint {}".format(endpoint))
|
||||
self.num_router_requests.record(1, {"endpoint": endpoint})
|
||||
self.num_router_requests.record(1, tags={"endpoint": endpoint})
|
||||
|
||||
request_context = request_meta.request_context
|
||||
query = Query(
|
||||
@@ -177,7 +181,8 @@ class Router:
|
||||
try:
|
||||
result = await query.async_future
|
||||
except RayTaskError as e:
|
||||
self.num_error_endpoint_requests.record(1, {"endpoint": endpoint})
|
||||
self.num_error_endpoint_requests.record(
|
||||
1, tags={"endpoint": endpoint})
|
||||
result = e
|
||||
return result
|
||||
|
||||
@@ -301,7 +306,8 @@ class Router:
|
||||
else:
|
||||
result = await object_ref
|
||||
except RayTaskError as error:
|
||||
self.num_error_backend_requests.record(1, {"backend": backend})
|
||||
self.num_error_backend_requests.record(
|
||||
1, tags={"backend": backend})
|
||||
result = error
|
||||
self.queries_counter[backend][backend_replica_tag] -= 1
|
||||
await self.mark_worker_idle(backend, backend_replica_tag)
|
||||
@@ -358,6 +364,7 @@ class Router:
|
||||
self.name, queue_lengths)
|
||||
|
||||
for backend, length in queue_lengths.items():
|
||||
self.backend_queue_size.record(length, {"backend": backend})
|
||||
self.backend_queue_size.record(
|
||||
length, tags={"backend": backend})
|
||||
|
||||
await asyncio.sleep(REPORT_QUEUE_LENGTH_PERIOD_S)
|
||||
|
||||
Reference in New Issue
Block a user