mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 11:21:15 +08:00
* Revert "[CLI] Fix ray commands when RAY_ADDRESS used (#11989)"
This reverts commit d23d326560.
* only check environment for CLI commands
* use new fns
* fixing docs
* rename and return "auto"
* Update python/ray/_private/services.py
Co-authored-by: Eric Liang <ekhliang@gmail.com>
* Update services.py
* Update services.py
Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
@@ -123,57 +123,58 @@ def new_port():
|
||||
|
||||
def find_redis_address(address=None):
|
||||
"""
|
||||
Currently, this extracts the deprecated --redis-address from the command
|
||||
that launched the raylet running on this node, if any. Anyone looking to
|
||||
edit this function should be warned that these commands look like, for
|
||||
example:
|
||||
/usr/local/lib/python3.8/dist-packages/ray/core/src/ray/raylet/raylet
|
||||
--redis_address=123.456.78.910 --node_ip_address=123.456.78.910
|
||||
--raylet_socket_name=... --store_socket_name=... --object_manager_port=0
|
||||
--min_worker_port=10000 --max_worker_port=10999 --node_manager_port=58578
|
||||
--redis_port=6379 --num_initial_workers=8 --maximum_startup_concurrency=8
|
||||
--static_resource_list=node:123.456.78.910,1.0,object_store_memory,66
|
||||
--config_list=plasma_store_as_thread,True
|
||||
--python_worker_command=/usr/bin/python
|
||||
/usr/local/lib/python3.8/dist-packages/ray/workers/default_worker.py
|
||||
--redis-address=123.456.78.910:6379
|
||||
--node-ip-address=123.456.78.910 --node-manager-port=58578
|
||||
--object-store-name=... --raylet-name=...
|
||||
--config-list=plasma_store_as_thread,True --temp-dir=/tmp/ray
|
||||
--metrics-agent-port=41856 --redis-password=[MASKED]
|
||||
--java_worker_command= --cpp_worker_command= --redis_password=[MASKED]
|
||||
--temp_dir=/tmp/ray --session_dir=... --metrics-agent-port=41856
|
||||
--metrics_export_port=64229
|
||||
--agent_command=/usr/bin/python
|
||||
-u /usr/local/lib/python3.8/dist-packages/ray/new_dashboard/agent.py
|
||||
--redis-address=123.456.78.910:6379 --metrics-export-port=64229
|
||||
--dashboard-agent-port=41856 --node-manager-port=58578
|
||||
--object-store-name=... --raylet-name=... --temp-dir=/tmp/ray
|
||||
--log-dir=/tmp/ray/session_2020-11-08_14-29-07_199128_278000/logs
|
||||
--redis-password=[MASKED] --object_store_memory=5037192806
|
||||
--plasma_directory=/tmp
|
||||
Longer arguments are elided with ... but all arguments from this instance
|
||||
are included, to provide a sense of what is in these.
|
||||
Indeed, we had to pull --redis-address to the front of each call to make
|
||||
this readable.
|
||||
As you can see, this is very long and complex, which is why we can't simply
|
||||
extract all the the arguments using regular expressions and present a dict
|
||||
as if we never lost track of these arguments, for example.
|
||||
Picking out --redis-address below looks like it might grab the wrong thing,
|
||||
but double-checking that we're finding the correct process by checking that
|
||||
the contents look like we expect would probably be prone to choking in
|
||||
unexpected ways.
|
||||
Notice that --redis-address appears twice. This is not a copy-paste error;
|
||||
this is the reason why the for loop below attempts to pick out every
|
||||
appearance of --redis-address.
|
||||
Attempts to find all valid Ray redis addresses on this node.
|
||||
|
||||
The --redis-address here is what is now called the --address, but it
|
||||
appears in the default_worker.py and agent.py calls as --redis-address.
|
||||
Returns:
|
||||
Set of detected Redis instances.
|
||||
"""
|
||||
# Currently, this extracts the deprecated --redis-address from the command
|
||||
# that launched the raylet running on this node, if any. Anyone looking to
|
||||
# edit this function should be warned that these commands look like, for
|
||||
# example:
|
||||
# /usr/local/lib/python3.8/dist-packages/ray/core/src/ray/raylet/raylet
|
||||
# --redis_address=123.456.78.910 --node_ip_address=123.456.78.910
|
||||
# --raylet_socket_name=... --store_socket_name=... --object_manager_port=0
|
||||
# --min_worker_port=10000 --max_worker_port=10999
|
||||
# --node_manager_port=58578 --redis_port=6379 --num_initial_workers=8
|
||||
# --maximum_startup_concurrency=8
|
||||
# --static_resource_list=node:123.456.78.910,1.0,object_store_memory,66
|
||||
# --config_list=plasma_store_as_thread,True
|
||||
# --python_worker_command=/usr/bin/python
|
||||
# /usr/local/lib/python3.8/dist-packages/ray/workers/default_worker.py
|
||||
# --redis-address=123.456.78.910:6379
|
||||
# --node-ip-address=123.456.78.910 --node-manager-port=58578
|
||||
# --object-store-name=... --raylet-name=...
|
||||
# --config-list=plasma_store_as_thread,True --temp-dir=/tmp/ray
|
||||
# --metrics-agent-port=41856 --redis-password=[MASKED]
|
||||
# --java_worker_command= --cpp_worker_command=
|
||||
# --redis_password=[MASKED] --temp_dir=/tmp/ray --session_dir=...
|
||||
# --metrics-agent-port=41856 --metrics_export_port=64229
|
||||
# --agent_command=/usr/bin/python
|
||||
# -u /usr/local/lib/python3.8/dist-packages/ray/new_dashboard/agent.py
|
||||
# --redis-address=123.456.78.910:6379 --metrics-export-port=64229
|
||||
# --dashboard-agent-port=41856 --node-manager-port=58578
|
||||
# --object-store-name=... --raylet-name=... --temp-dir=/tmp/ray
|
||||
# --log-dir=/tmp/ray/session_2020-11-08_14-29-07_199128_278000/logs
|
||||
# --redis-password=[MASKED] --object_store_memory=5037192806
|
||||
# --plasma_directory=/tmp
|
||||
# Longer arguments are elided with ... but all arguments from this instance
|
||||
# are included, to provide a sense of what is in these.
|
||||
# Indeed, we had to pull --redis-address to the front of each call to make
|
||||
# this readable.
|
||||
# As you can see, this is very long and complex, which is why we can't
|
||||
# simply extract all the the arguments using regular expressions and
|
||||
# present a dict as if we never lost track of these arguments, for
|
||||
# example. Picking out --redis-address below looks like it might grab the
|
||||
# wrong thing, but double-checking that we're finding the correct process
|
||||
# by checking that the contents look like we expect would probably be prone
|
||||
# to choking in unexpected ways.
|
||||
# Notice that --redis-address appears twice. This is not a copy-paste
|
||||
# error; this is the reason why the for loop below attempts to pick out
|
||||
# every appearance of --redis-address.
|
||||
|
||||
if "RAY_ADDRESS" in os.environ:
|
||||
return os.environ["RAY_ADDRESS"]
|
||||
|
||||
# The --redis-address here is what is now called the --address, but it
|
||||
# appears in the default_worker.py and agent.py calls as --redis-address.
|
||||
pids = psutil.pids()
|
||||
redis_addresses = set()
|
||||
for pid in pids:
|
||||
@@ -207,7 +208,21 @@ def find_redis_address(address=None):
|
||||
return redis_addresses
|
||||
|
||||
|
||||
def get_ray_address_to_use_or_die():
|
||||
"""
|
||||
Attempts to find an address for an existing Ray cluster if it is not
|
||||
already specified as an environment variable.
|
||||
Returns:
|
||||
A string to pass into `ray.init(address=...)`
|
||||
"""
|
||||
if "RAY_ADDRESS" in os.environ:
|
||||
return "auto" # Avoid conflict with RAY_ADDRESS env var
|
||||
|
||||
return find_redis_address_or_die()
|
||||
|
||||
|
||||
def find_redis_address_or_die():
|
||||
|
||||
redis_addresses = find_redis_address()
|
||||
if len(redis_addresses) > 1:
|
||||
raise ConnectionError(
|
||||
|
||||
@@ -160,7 +160,7 @@ def debug(address):
|
||||
"""Show all active breakpoints and exceptions in the Ray debugger."""
|
||||
from telnetlib import Telnet
|
||||
if not address:
|
||||
address = services.find_redis_address_or_die()
|
||||
address = services.get_ray_address_to_use_or_die()
|
||||
logger.info(f"Connecting to Ray instance at {address}.")
|
||||
ray.init(address=address)
|
||||
while True:
|
||||
@@ -1309,7 +1309,7 @@ def microbenchmark():
|
||||
def timeline(address):
|
||||
"""Take a Chrome tracing timeline for a Ray cluster."""
|
||||
if not address:
|
||||
address = services.find_redis_address_or_die()
|
||||
address = services.get_ray_address_to_use_or_die()
|
||||
logger.info(f"Connecting to Ray instance at {address}.")
|
||||
ray.init(address=address)
|
||||
time = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
@@ -1337,7 +1337,7 @@ def timeline(address):
|
||||
def memory(address, redis_password):
|
||||
"""Print object references held in a Ray cluster."""
|
||||
if not address:
|
||||
address = services.find_redis_address_or_die()
|
||||
address = services.get_ray_address_to_use_or_die()
|
||||
logger.info(f"Connecting to Ray instance at {address}.")
|
||||
ray.init(address=address, _redis_password=redis_password)
|
||||
print(ray.internal.internal_api.memory_summary())
|
||||
@@ -1352,7 +1352,7 @@ def memory(address, redis_password):
|
||||
def status(address):
|
||||
"""Print cluster status, including autoscaling info."""
|
||||
if not address:
|
||||
address = services.find_redis_address_or_die()
|
||||
address = services.get_ray_address_to_use_or_die()
|
||||
logger.info(f"Connecting to Ray instance at {address}.")
|
||||
ray.init(address=address)
|
||||
print(debug_status())
|
||||
@@ -1367,7 +1367,7 @@ def status(address):
|
||||
def global_gc(address):
|
||||
"""Trigger Python garbage collection on all cluster workers."""
|
||||
if not address:
|
||||
address = services.find_redis_address_or_die()
|
||||
address = services.get_ray_address_to_use_or_die()
|
||||
logger.info(f"Connecting to Ray instance at {address}.")
|
||||
ray.init(address=address)
|
||||
ray.internal.internal_api.global_gc()
|
||||
|
||||
Reference in New Issue
Block a user