diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 7a76e4e0e..cad37d513 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -123,57 +123,58 @@ def new_port(): def find_redis_address(address=None): """ - Currently, this extracts the deprecated --redis-address from the command - that launched the raylet running on this node, if any. Anyone looking to - edit this function should be warned that these commands look like, for - example: - /usr/local/lib/python3.8/dist-packages/ray/core/src/ray/raylet/raylet - --redis_address=123.456.78.910 --node_ip_address=123.456.78.910 - --raylet_socket_name=... --store_socket_name=... --object_manager_port=0 - --min_worker_port=10000 --max_worker_port=10999 --node_manager_port=58578 - --redis_port=6379 --num_initial_workers=8 --maximum_startup_concurrency=8 - --static_resource_list=node:123.456.78.910,1.0,object_store_memory,66 - --config_list=plasma_store_as_thread,True - --python_worker_command=/usr/bin/python - /usr/local/lib/python3.8/dist-packages/ray/workers/default_worker.py - --redis-address=123.456.78.910:6379 - --node-ip-address=123.456.78.910 --node-manager-port=58578 - --object-store-name=... --raylet-name=... - --config-list=plasma_store_as_thread,True --temp-dir=/tmp/ray - --metrics-agent-port=41856 --redis-password=[MASKED] - --java_worker_command= --cpp_worker_command= --redis_password=[MASKED] - --temp_dir=/tmp/ray --session_dir=... --metrics-agent-port=41856 - --metrics_export_port=64229 - --agent_command=/usr/bin/python - -u /usr/local/lib/python3.8/dist-packages/ray/new_dashboard/agent.py - --redis-address=123.456.78.910:6379 --metrics-export-port=64229 - --dashboard-agent-port=41856 --node-manager-port=58578 - --object-store-name=... --raylet-name=... --temp-dir=/tmp/ray - --log-dir=/tmp/ray/session_2020-11-08_14-29-07_199128_278000/logs - --redis-password=[MASKED] --object_store_memory=5037192806 - --plasma_directory=/tmp - Longer arguments are elided with ... but all arguments from this instance - are included, to provide a sense of what is in these. - Indeed, we had to pull --redis-address to the front of each call to make - this readable. - As you can see, this is very long and complex, which is why we can't simply - extract all the the arguments using regular expressions and present a dict - as if we never lost track of these arguments, for example. - Picking out --redis-address below looks like it might grab the wrong thing, - but double-checking that we're finding the correct process by checking that - the contents look like we expect would probably be prone to choking in - unexpected ways. - Notice that --redis-address appears twice. This is not a copy-paste error; - this is the reason why the for loop below attempts to pick out every - appearance of --redis-address. + Attempts to find all valid Ray redis addresses on this node. - The --redis-address here is what is now called the --address, but it - appears in the default_worker.py and agent.py calls as --redis-address. + Returns: + Set of detected Redis instances. """ + # Currently, this extracts the deprecated --redis-address from the command + # that launched the raylet running on this node, if any. Anyone looking to + # edit this function should be warned that these commands look like, for + # example: + # /usr/local/lib/python3.8/dist-packages/ray/core/src/ray/raylet/raylet + # --redis_address=123.456.78.910 --node_ip_address=123.456.78.910 + # --raylet_socket_name=... --store_socket_name=... --object_manager_port=0 + # --min_worker_port=10000 --max_worker_port=10999 + # --node_manager_port=58578 --redis_port=6379 --num_initial_workers=8 + # --maximum_startup_concurrency=8 + # --static_resource_list=node:123.456.78.910,1.0,object_store_memory,66 + # --config_list=plasma_store_as_thread,True + # --python_worker_command=/usr/bin/python + # /usr/local/lib/python3.8/dist-packages/ray/workers/default_worker.py + # --redis-address=123.456.78.910:6379 + # --node-ip-address=123.456.78.910 --node-manager-port=58578 + # --object-store-name=... --raylet-name=... + # --config-list=plasma_store_as_thread,True --temp-dir=/tmp/ray + # --metrics-agent-port=41856 --redis-password=[MASKED] + # --java_worker_command= --cpp_worker_command= + # --redis_password=[MASKED] --temp_dir=/tmp/ray --session_dir=... + # --metrics-agent-port=41856 --metrics_export_port=64229 + # --agent_command=/usr/bin/python + # -u /usr/local/lib/python3.8/dist-packages/ray/new_dashboard/agent.py + # --redis-address=123.456.78.910:6379 --metrics-export-port=64229 + # --dashboard-agent-port=41856 --node-manager-port=58578 + # --object-store-name=... --raylet-name=... --temp-dir=/tmp/ray + # --log-dir=/tmp/ray/session_2020-11-08_14-29-07_199128_278000/logs + # --redis-password=[MASKED] --object_store_memory=5037192806 + # --plasma_directory=/tmp + # Longer arguments are elided with ... but all arguments from this instance + # are included, to provide a sense of what is in these. + # Indeed, we had to pull --redis-address to the front of each call to make + # this readable. + # As you can see, this is very long and complex, which is why we can't + # simply extract all the the arguments using regular expressions and + # present a dict as if we never lost track of these arguments, for + # example. Picking out --redis-address below looks like it might grab the + # wrong thing, but double-checking that we're finding the correct process + # by checking that the contents look like we expect would probably be prone + # to choking in unexpected ways. + # Notice that --redis-address appears twice. This is not a copy-paste + # error; this is the reason why the for loop below attempts to pick out + # every appearance of --redis-address. - if "RAY_ADDRESS" in os.environ: - return os.environ["RAY_ADDRESS"] - + # The --redis-address here is what is now called the --address, but it + # appears in the default_worker.py and agent.py calls as --redis-address. pids = psutil.pids() redis_addresses = set() for pid in pids: @@ -207,7 +208,21 @@ def find_redis_address(address=None): return redis_addresses +def get_ray_address_to_use_or_die(): + """ + Attempts to find an address for an existing Ray cluster if it is not + already specified as an environment variable. + Returns: + A string to pass into `ray.init(address=...)` + """ + if "RAY_ADDRESS" in os.environ: + return "auto" # Avoid conflict with RAY_ADDRESS env var + + return find_redis_address_or_die() + + def find_redis_address_or_die(): + redis_addresses = find_redis_address() if len(redis_addresses) > 1: raise ConnectionError( diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index cd3ba418f..09d021902 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -160,7 +160,7 @@ def debug(address): """Show all active breakpoints and exceptions in the Ray debugger.""" from telnetlib import Telnet if not address: - address = services.find_redis_address_or_die() + address = services.get_ray_address_to_use_or_die() logger.info(f"Connecting to Ray instance at {address}.") ray.init(address=address) while True: @@ -1309,7 +1309,7 @@ def microbenchmark(): def timeline(address): """Take a Chrome tracing timeline for a Ray cluster.""" if not address: - address = services.find_redis_address_or_die() + address = services.get_ray_address_to_use_or_die() logger.info(f"Connecting to Ray instance at {address}.") ray.init(address=address) time = datetime.today().strftime("%Y-%m-%d_%H-%M-%S") @@ -1337,7 +1337,7 @@ def timeline(address): def memory(address, redis_password): """Print object references held in a Ray cluster.""" if not address: - address = services.find_redis_address_or_die() + address = services.get_ray_address_to_use_or_die() logger.info(f"Connecting to Ray instance at {address}.") ray.init(address=address, _redis_password=redis_password) print(ray.internal.internal_api.memory_summary()) @@ -1352,7 +1352,7 @@ def memory(address, redis_password): def status(address): """Print cluster status, including autoscaling info.""" if not address: - address = services.find_redis_address_or_die() + address = services.get_ray_address_to_use_or_die() logger.info(f"Connecting to Ray instance at {address}.") ray.init(address=address) print(debug_status()) @@ -1367,7 +1367,7 @@ def status(address): def global_gc(address): """Trigger Python garbage collection on all cluster workers.""" if not address: - address = services.find_redis_address_or_die() + address = services.get_ray_address_to_use_or_die() logger.info(f"Connecting to Ray instance at {address}.") ray.init(address=address) ray.internal.internal_api.global_gc()