From 4575cd88b2b053e0fdc70031a3dd52ed0c4c7744 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Sun, 22 Jan 2017 14:53:15 -0800 Subject: [PATCH] Improve error messages when nodes can't communicate with each other. (#223) * Good error messages when nodes can't communicate with each other * Print more information when starting the head node. * Change retries back to 5. --- python/ray/services.py | 1 + python/ray/worker.py | 4 +++- scripts/start_ray.py | 23 ++++++++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/python/ray/services.py b/python/ray/services.py index d9ac43aff..bb52f4d7c 100644 --- a/python/ray/services.py +++ b/python/ray/services.py @@ -166,6 +166,7 @@ def wait_for_redis_to_start(redis_host, redis_port, num_retries=5): while counter < num_retries: try: # Run some random command and see if it worked. + print("Waiting for redis server at {}:{} to respond...".format(redis_host, redis_port)) redis_client.client_list() except redis.ConnectionError as e: # Wait a little bit. diff --git a/python/ray/worker.py b/python/ray/worker.py index 011d4032b..4acac871f 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -674,7 +674,9 @@ def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5): if counter == num_retries: raise # Some of the information may not be in Redis yet, so wait a little bit. - print("Some processes that the driver needs to connect to have not registered with Redis, so retrying.") + print("Some processes that the driver needs to connect to have not " + "registered with Redis, so retrying. Have you run " + "./scripts/start_ray.sh on this node?") time.sleep(1) counter += 1 diff --git a/scripts/start_ray.py b/scripts/start_ray.py index 1f6383fe2..823d2fe68 100644 --- a/scripts/start_ray.py +++ b/scripts/start_ray.py @@ -51,6 +51,22 @@ if __name__ == "__main__": num_workers=args.num_workers, cleanup=False, redirect_output=True) + print(address_info) + print("\nStarted Ray with {} workers on this node. A different number of " + "workers can be set with the --num-workers flag (but you have to " + "first terminate the existing cluster). You can add additional nodes " + "to the cluster by calling\n\n" + " ./scripts/start_ray.sh --redis-address {}\n\n" + "from the node you wish to add. You can connect a driver to the " + "cluster from Python by running\n\n" + " import ray\n" + " ray.init(redis_address=\"{}\")\n\n" + "If you have trouble connecting from a different machine, check that " + "your firewall is configured properly. If you wish to terminate the " + "processes that have been started, run\n\n" + " ./scripts/stop_ray.sh".format(args.num_workers, + address_info["redis_address"], + address_info["redis_address"])) else: # Start Ray on a non-head node. if args.redis_address is None: @@ -74,4 +90,9 @@ if __name__ == "__main__": num_workers=args.num_workers, cleanup=False, redirect_output=True) - print(address_info) + print(address_info) + print("\nStarted {} workers on this node. A different number of workers " + "can be set with the --num-workers flag (but you have to first " + "terminate the existing cluster). If you wish to terminate the " + "processes that have been started, run\n\n" + " ./scripts/stop_ray.sh".format(args.num_workers))