Revert "[CLI] Fix ray commands when RAY_ADDRESS used (#11989)" (#12135)

* Revert "[CLI] Fix ray commands when RAY_ADDRESS used (#11989)"

This reverts commit d23d326560.

* only check environment for CLI commands

* use new fns

* fixing docs

* rename and return "auto"

* Update python/ray/_private/services.py

Co-authored-by: Eric Liang <ekhliang@gmail.com>

* Update services.py

* Update services.py

Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
Ian Rodney
2020-11-18 22:41:10 -08:00
committed by GitHub
parent 5bc4976550
commit a74f1885db
2 changed files with 68 additions and 53 deletions
+63 -48
View File
@@ -123,57 +123,58 @@ def new_port():
def find_redis_address(address=None):
"""
Currently, this extracts the deprecated --redis-address from the command
that launched the raylet running on this node, if any. Anyone looking to
edit this function should be warned that these commands look like, for
example:
/usr/local/lib/python3.8/dist-packages/ray/core/src/ray/raylet/raylet
--redis_address=123.456.78.910 --node_ip_address=123.456.78.910
--raylet_socket_name=... --store_socket_name=... --object_manager_port=0
--min_worker_port=10000 --max_worker_port=10999 --node_manager_port=58578
--redis_port=6379 --num_initial_workers=8 --maximum_startup_concurrency=8
--static_resource_list=node:123.456.78.910,1.0,object_store_memory,66
--config_list=plasma_store_as_thread,True
--python_worker_command=/usr/bin/python
/usr/local/lib/python3.8/dist-packages/ray/workers/default_worker.py
--redis-address=123.456.78.910:6379
--node-ip-address=123.456.78.910 --node-manager-port=58578
--object-store-name=... --raylet-name=...
--config-list=plasma_store_as_thread,True --temp-dir=/tmp/ray
--metrics-agent-port=41856 --redis-password=[MASKED]
--java_worker_command= --cpp_worker_command= --redis_password=[MASKED]
--temp_dir=/tmp/ray --session_dir=... --metrics-agent-port=41856
--metrics_export_port=64229
--agent_command=/usr/bin/python
-u /usr/local/lib/python3.8/dist-packages/ray/new_dashboard/agent.py
--redis-address=123.456.78.910:6379 --metrics-export-port=64229
--dashboard-agent-port=41856 --node-manager-port=58578
--object-store-name=... --raylet-name=... --temp-dir=/tmp/ray
--log-dir=/tmp/ray/session_2020-11-08_14-29-07_199128_278000/logs
--redis-password=[MASKED] --object_store_memory=5037192806
--plasma_directory=/tmp
Longer arguments are elided with ... but all arguments from this instance
are included, to provide a sense of what is in these.
Indeed, we had to pull --redis-address to the front of each call to make
this readable.
As you can see, this is very long and complex, which is why we can't simply
extract all the the arguments using regular expressions and present a dict
as if we never lost track of these arguments, for example.
Picking out --redis-address below looks like it might grab the wrong thing,
but double-checking that we're finding the correct process by checking that
the contents look like we expect would probably be prone to choking in
unexpected ways.
Notice that --redis-address appears twice. This is not a copy-paste error;
this is the reason why the for loop below attempts to pick out every
appearance of --redis-address.
Attempts to find all valid Ray redis addresses on this node.
The --redis-address here is what is now called the --address, but it
appears in the default_worker.py and agent.py calls as --redis-address.
Returns:
Set of detected Redis instances.
"""
# Currently, this extracts the deprecated --redis-address from the command
# that launched the raylet running on this node, if any. Anyone looking to
# edit this function should be warned that these commands look like, for
# example:
# /usr/local/lib/python3.8/dist-packages/ray/core/src/ray/raylet/raylet
# --redis_address=123.456.78.910 --node_ip_address=123.456.78.910
# --raylet_socket_name=... --store_socket_name=... --object_manager_port=0
# --min_worker_port=10000 --max_worker_port=10999
# --node_manager_port=58578 --redis_port=6379 --num_initial_workers=8
# --maximum_startup_concurrency=8
# --static_resource_list=node:123.456.78.910,1.0,object_store_memory,66
# --config_list=plasma_store_as_thread,True
# --python_worker_command=/usr/bin/python
# /usr/local/lib/python3.8/dist-packages/ray/workers/default_worker.py
# --redis-address=123.456.78.910:6379
# --node-ip-address=123.456.78.910 --node-manager-port=58578
# --object-store-name=... --raylet-name=...
# --config-list=plasma_store_as_thread,True --temp-dir=/tmp/ray
# --metrics-agent-port=41856 --redis-password=[MASKED]
# --java_worker_command= --cpp_worker_command=
# --redis_password=[MASKED] --temp_dir=/tmp/ray --session_dir=...
# --metrics-agent-port=41856 --metrics_export_port=64229
# --agent_command=/usr/bin/python
# -u /usr/local/lib/python3.8/dist-packages/ray/new_dashboard/agent.py
# --redis-address=123.456.78.910:6379 --metrics-export-port=64229
# --dashboard-agent-port=41856 --node-manager-port=58578
# --object-store-name=... --raylet-name=... --temp-dir=/tmp/ray
# --log-dir=/tmp/ray/session_2020-11-08_14-29-07_199128_278000/logs
# --redis-password=[MASKED] --object_store_memory=5037192806
# --plasma_directory=/tmp
# Longer arguments are elided with ... but all arguments from this instance
# are included, to provide a sense of what is in these.
# Indeed, we had to pull --redis-address to the front of each call to make
# this readable.
# As you can see, this is very long and complex, which is why we can't
# simply extract all the the arguments using regular expressions and
# present a dict as if we never lost track of these arguments, for
# example. Picking out --redis-address below looks like it might grab the
# wrong thing, but double-checking that we're finding the correct process
# by checking that the contents look like we expect would probably be prone
# to choking in unexpected ways.
# Notice that --redis-address appears twice. This is not a copy-paste
# error; this is the reason why the for loop below attempts to pick out
# every appearance of --redis-address.
if "RAY_ADDRESS" in os.environ:
return os.environ["RAY_ADDRESS"]
# The --redis-address here is what is now called the --address, but it
# appears in the default_worker.py and agent.py calls as --redis-address.
pids = psutil.pids()
redis_addresses = set()
for pid in pids:
@@ -207,7 +208,21 @@ def find_redis_address(address=None):
return redis_addresses
def get_ray_address_to_use_or_die():
"""
Attempts to find an address for an existing Ray cluster if it is not
already specified as an environment variable.
Returns:
A string to pass into `ray.init(address=...)`
"""
if "RAY_ADDRESS" in os.environ:
return "auto" # Avoid conflict with RAY_ADDRESS env var
return find_redis_address_or_die()
def find_redis_address_or_die():
redis_addresses = find_redis_address()
if len(redis_addresses) > 1:
raise ConnectionError(
+5 -5
View File
@@ -160,7 +160,7 @@ def debug(address):
"""Show all active breakpoints and exceptions in the Ray debugger."""
from telnetlib import Telnet
if not address:
address = services.find_redis_address_or_die()
address = services.get_ray_address_to_use_or_die()
logger.info(f"Connecting to Ray instance at {address}.")
ray.init(address=address)
while True:
@@ -1309,7 +1309,7 @@ def microbenchmark():
def timeline(address):
"""Take a Chrome tracing timeline for a Ray cluster."""
if not address:
address = services.find_redis_address_or_die()
address = services.get_ray_address_to_use_or_die()
logger.info(f"Connecting to Ray instance at {address}.")
ray.init(address=address)
time = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
@@ -1337,7 +1337,7 @@ def timeline(address):
def memory(address, redis_password):
"""Print object references held in a Ray cluster."""
if not address:
address = services.find_redis_address_or_die()
address = services.get_ray_address_to_use_or_die()
logger.info(f"Connecting to Ray instance at {address}.")
ray.init(address=address, _redis_password=redis_password)
print(ray.internal.internal_api.memory_summary())
@@ -1352,7 +1352,7 @@ def memory(address, redis_password):
def status(address):
"""Print cluster status, including autoscaling info."""
if not address:
address = services.find_redis_address_or_die()
address = services.get_ray_address_to_use_or_die()
logger.info(f"Connecting to Ray instance at {address}.")
ray.init(address=address)
print(debug_status())
@@ -1367,7 +1367,7 @@ def status(address):
def global_gc(address):
"""Trigger Python garbage collection on all cluster workers."""
if not address:
address = services.find_redis_address_or_die()
address = services.get_ray_address_to_use_or_die()
logger.info(f"Connecting to Ray instance at {address}.")
ray.init(address=address)
ray.internal.internal_api.global_gc()