Unconditionally retry all RPC errors on client connect (#13845)

* wip

* Update python/ray/util/client/worker.py

Co-authored-by: fangfengbin <869218239a@zju.edu.cn>

Co-authored-by: fangfengbin <869218239a@zju.edu.cn>
This commit is contained in:
Eric Liang
2021-02-02 00:10:35 -08:00
committed by GitHub
parent d71eeac2d6
commit 88ab887cc4
+5 -11
View File
@@ -101,17 +101,11 @@ class Worker:
# Note that channel_ready_future constitutes its own timeout,
# which is why we do not sleep here.
except grpc.RpcError as e:
-if e.code() == grpc.StatusCode.UNAVAILABLE:
-# UNAVAILABLE is gRPC's retryable error,
-# so we do that here.
-logger.info("Ray client server unavailable, "
-f"retrying in {timeout}s...")
-logger.debug(f"Received when checking init: {e.details()}")
-# Ray is not ready yet, wait a timeout
-time.sleep(timeout)
-else:
-# Any other gRPC error gets a reraise
-raise e
+logger.info("Ray client server unavailable, "
+f"retrying in {timeout}s...")
+logger.debug(f"Received when checking init: {e.details()}")
+# Ray is not ready yet, wait a timeout.
+time.sleep(timeout)
# Fallthrough, backoff, and retry at the top of the loop
logger.info("Waiting for Ray to become ready on the server, "
f"retry in {timeout}s...")