mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:00:36 +08:00
Improve error messages when nodes can't communicate with each other. (#223)
* Good error messages when nodes can't communicate with each other.
* Print more information when starting the head node.
* Change retries back to 5.
This commit is contained in:
committed by
Philipp Moritz
parent
7151ed5cdf
commit
4575cd88b2
@@ -166,6 +166,7 @@ def wait_for_redis_to_start(redis_host, redis_port, num_retries=5):
|
||||
while counter < num_retries:
|
||||
try:
|
||||
# Run some random command and see if it worked.
|
||||
print("Waiting for redis server at {}:{} to respond...".format(redis_host, redis_port))
|
||||
redis_client.client_list()
|
||||
except redis.ConnectionError as e:
|
||||
# Wait a little bit.
|
||||
|
||||
@@ -674,7 +674,9 @@ def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5):
|
||||
if counter == num_retries:
|
||||
raise
|
||||
# Some of the information may not be in Redis yet, so wait a little bit.
|
||||
print("Some processes that the driver needs to connect to have not registered with Redis, so retrying.")
|
||||
print("Some processes that the driver needs to connect to have not "
|
||||
"registered with Redis, so retrying. Have you run "
|
||||
"./scripts/start_ray.sh on this node?")
|
||||
time.sleep(1)
|
||||
counter += 1
|
||||
|
||||
|
||||
+22
-1
@@ -51,6 +51,22 @@ if __name__ == "__main__":
|
||||
num_workers=args.num_workers,
|
||||
cleanup=False,
|
||||
redirect_output=True)
|
||||
print(address_info)
|
||||
print("\nStarted Ray with {} workers on this node. A different number of "
|
||||
"workers can be set with the --num-workers flag (but you have to "
|
||||
"first terminate the existing cluster). You can add additional nodes "
|
||||
"to the cluster by calling\n\n"
|
||||
" ./scripts/start_ray.sh --redis-address {}\n\n"
|
||||
"from the node you wish to add. You can connect a driver to the "
|
||||
"cluster from Python by running\n\n"
|
||||
" import ray\n"
|
||||
" ray.init(redis_address=\"{}\")\n\n"
|
||||
"If you have trouble connecting from a different machine, check that "
|
||||
"your firewall is configured properly. If you wish to terminate the "
|
||||
"processes that have been started, run\n\n"
|
||||
" ./scripts/stop_ray.sh".format(args.num_workers,
|
||||
address_info["redis_address"],
|
||||
address_info["redis_address"]))
|
||||
else:
|
||||
# Start Ray on a non-head node.
|
||||
if args.redis_address is None:
|
||||
@@ -74,4 +90,9 @@ if __name__ == "__main__":
|
||||
num_workers=args.num_workers,
|
||||
cleanup=False,
|
||||
redirect_output=True)
|
||||
print(address_info)
|
||||
print(address_info)
|
||||
print("\nStarted {} workers on this node. A different number of workers "
|
||||
"can be set with the --num-workers flag (but you have to first "
|
||||
"terminate the existing cluster). If you wish to terminate the "
|
||||
"processes that have been started, run\n\n"
|
||||
" ./scripts/stop_ray.sh".format(args.num_workers))
|
||||
|
||||
Reference in New Issue
Block a user