[Serve] Hotfix: Fix actor handle hashing in metric monitoring (#5886)

This commit is contained in:
Simon Mo
2019-10-11 00:31:42 -07:00
committed by Philipp Moritz
parent 523c764c25
commit 4b99cb429e
4 changed files with 12 additions and 9 deletions
@@ -98,6 +98,8 @@ class GlobalState:
logger.debug((LOG_PREFIX + "Checking if HTTP server is ready."
"{} retries left.").format(retries))
time.sleep(backoff_time_s)
# Exponential backoff
backoff_time_s *= 2
retries -= 1
if retries == 0:
raise Exception(
+2 -2
View File
@@ -27,11 +27,11 @@ class MetricMonitor:
return True
def add_target(self, target_handle):
hex_id = target_handle._ray_actor_id.hex()
hex_id = target_handle._ray_core_handle.actor_id().hex()
self.actor_handles[hex_id] = target_handle
def remove_target(self, target_handle):
hex_id = target_handle._ray_actor_id.hex()
hex_id = target_handle._ray_core_handle.actor_id().hex()
self.actor_handles.pop(hex_id)
def scrape(self):
@@ -7,19 +7,21 @@ from ray.experimental import serve
def test_e2e(serve_instance):
serve.create_endpoint("endpoint", "/api")
serve.create_endpoint("endpoint", "/api", blocking=True)
result = ray.get(
serve.global_state.kv_store_actor_handle.list_service.remote())
assert result == {"/api": "endpoint"}
retry_count = 3
retry_count = 5
timeout_sleep = 0.5
while True:
try:
resp = requests.get("http://127.0.0.1:8000/").json()
assert resp == result
break
except Exception:
time.sleep(0.5)
time.sleep(timeout_sleep)
timeout_sleep *= 2
retry_count -= 1
if retry_count == 0:
assert False, "Route table hasn't been updated after 3 tries."
@@ -38,11 +38,10 @@ def test_metric_gc(ray_instance, start_target_actor):
target_actor = start_target_actor
# gc_window_seconds=0 means that when new scrapes are invoked, the old metrics are cleared.
metric_monitor = MetricMonitor.remote(gc_window_seconds=0)
metric_monitor.add_target.remote(target_actor)
ray.get(metric_monitor.add_target.remote(target_actor))
ray.get(metric_monitor.scrape.remote())
df = ray.get(metric_monitor._get_dataframe.remote())
print(df)
assert len(df) == 102
# Old metric should be cleared. So only 1 counter + 101 list values left.
@@ -56,10 +55,10 @@ def test_metric_system(ray_instance, start_target_actor):
metric_monitor = MetricMonitor.remote()
metric_monitor.add_target.remote(target_actor)
ray.get(metric_monitor.add_target.remote(target_actor))
# Scrape once
metric_monitor.scrape.remote()
ray.get(metric_monitor.scrape.remote())
percentiles = [50, 90, 95]
agg_windows_seconds = [60]