mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 17:04:56 +08:00
[Core] Added ability to specify different IP addresses for a core worker and its raylet. (#7985)
This commit is contained in:
@@ -630,8 +630,8 @@ cdef class CoreWorker:
|
||||
|
||||
def __cinit__(self, is_driver, store_socket, raylet_socket,
|
||||
JobID job_id, GcsClientOptions gcs_options, log_dir,
|
||||
node_ip_address, node_manager_port, local_mode,
|
||||
driver_name, stdout_file, stderr_file):
|
||||
node_ip_address, node_manager_port, raylet_ip_address,
|
||||
local_mode, driver_name, stdout_file, stderr_file):
|
||||
self.is_driver = is_driver
|
||||
self.is_local_mode = local_mode
|
||||
|
||||
@@ -647,6 +647,7 @@ cdef class CoreWorker:
|
||||
options.install_failure_signal_handler = True
|
||||
options.node_ip_address = node_ip_address.encode("utf-8")
|
||||
options.node_manager_port = node_manager_port
|
||||
options.raylet_ip_address = raylet_ip_address.encode("utf-8")
|
||||
options.driver_name = driver_name
|
||||
options.stdout_file = stdout_file
|
||||
options.stderr_file = stderr_file
|
||||
|
||||
@@ -195,6 +195,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
c_bool install_failure_signal_handler
|
||||
c_string node_ip_address
|
||||
int node_manager_port
|
||||
c_string raylet_ip_address
|
||||
c_string driver_name
|
||||
c_string stdout_file
|
||||
c_string stderr_file
|
||||
|
||||
+28
-9
@@ -80,6 +80,19 @@ class Node:
|
||||
node_ip_address = ray.services.get_node_ip_address()
|
||||
self._node_ip_address = node_ip_address
|
||||
|
||||
if ray_params.raylet_ip_address:
|
||||
raylet_ip_address = ray_params.raylet_ip_address
|
||||
else:
|
||||
raylet_ip_address = node_ip_address
|
||||
|
||||
if raylet_ip_address != node_ip_address and (not connect_only or head):
|
||||
raise ValueError(
|
||||
"The raylet IP address should only be different than the node "
|
||||
"IP address when connecting to an existing raylet; i.e., when "
|
||||
"head=False and connect_only=True.")
|
||||
|
||||
self._raylet_ip_address = raylet_ip_address
|
||||
|
||||
ray_params.update_if_absent(
|
||||
include_log_monitor=True,
|
||||
resources={},
|
||||
@@ -122,7 +135,7 @@ class Node:
|
||||
# from Redis.
|
||||
address_info = ray.services.get_address_info_from_redis(
|
||||
self.redis_address,
|
||||
self._node_ip_address,
|
||||
self._raylet_ip_address,
|
||||
redis_password=self.redis_password)
|
||||
self._plasma_store_socket_name = address_info[
|
||||
"object_store_address"]
|
||||
@@ -229,9 +242,14 @@ class Node:
|
||||
|
||||
@property
|
||||
def node_ip_address(self):
|
||||
"""Get the cluster Redis address."""
|
||||
"""Get the IP address of this node."""
|
||||
return self._node_ip_address
|
||||
|
||||
@property
|
||||
def raylet_ip_address(self):
|
||||
"""Get the IP address of the raylet that this node connects to."""
|
||||
return self._raylet_ip_address
|
||||
|
||||
@property
|
||||
def address(self):
|
||||
"""Get the cluster address."""
|
||||
@@ -287,6 +305,7 @@ class Node:
|
||||
"""Get a dictionary of addresses."""
|
||||
return {
|
||||
"node_ip_address": self._node_ip_address,
|
||||
"raylet_ip_address": self._raylet_ip_address,
|
||||
"redis_address": self._redis_address,
|
||||
"object_store_address": self._plasma_store_socket_name,
|
||||
"raylet_socket_name": self._raylet_socket_name,
|
||||
@@ -429,7 +448,7 @@ class Node:
|
||||
assert ray_constants.PROCESS_TYPE_REAPER not in self.all_processes
|
||||
if process_info is not None:
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_REAPER] = [
|
||||
process_info
|
||||
process_info,
|
||||
]
|
||||
|
||||
def start_redis(self):
|
||||
@@ -469,7 +488,7 @@ class Node:
|
||||
fate_share=self.kernel_fate_share)
|
||||
assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] = [
|
||||
process_info
|
||||
process_info,
|
||||
]
|
||||
|
||||
def start_reporter(self):
|
||||
@@ -484,7 +503,7 @@ class Node:
|
||||
assert ray_constants.PROCESS_TYPE_REPORTER not in self.all_processes
|
||||
if process_info is not None:
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_REPORTER] = [
|
||||
process_info
|
||||
process_info,
|
||||
]
|
||||
|
||||
def start_dashboard(self, require_webui):
|
||||
@@ -508,7 +527,7 @@ class Node:
|
||||
assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes
|
||||
if process_info is not None:
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_DASHBOARD] = [
|
||||
process_info
|
||||
process_info,
|
||||
]
|
||||
redis_client = self.create_redis_client()
|
||||
redis_client.hmset("webui", {"url": self._webui_url})
|
||||
@@ -527,7 +546,7 @@ class Node:
|
||||
assert (
|
||||
ray_constants.PROCESS_TYPE_PLASMA_STORE not in self.all_processes)
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] = [
|
||||
process_info
|
||||
process_info,
|
||||
]
|
||||
|
||||
def start_gcs_server(self):
|
||||
@@ -544,7 +563,7 @@ class Node:
|
||||
assert (
|
||||
ray_constants.PROCESS_TYPE_GCS_SERVER not in self.all_processes)
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER] = [
|
||||
process_info
|
||||
process_info,
|
||||
]
|
||||
|
||||
def start_raylet(self, use_valgrind=False, use_profiler=False):
|
||||
@@ -617,7 +636,7 @@ class Node:
|
||||
assert (ray_constants.PROCESS_TYPE_RAYLET_MONITOR not in
|
||||
self.all_processes)
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR] = [
|
||||
process_info
|
||||
process_info,
|
||||
]
|
||||
|
||||
def start_head_processes(self):
|
||||
|
||||
@@ -33,6 +33,8 @@ class RayParams:
|
||||
object_manager_port int: The port to use for the object manager.
|
||||
node_manager_port: The port to use for the node manager.
|
||||
node_ip_address (str): The IP address of the node that we are on.
|
||||
raylet_ip_address (str): The IP address of the raylet that this node
|
||||
connects to.
|
||||
object_id_seed (int): Used to seed the deterministic generation of
|
||||
object IDs. The same value can be used across multiple runs of the
|
||||
same job in order to generate the object IDs in a consistent
|
||||
@@ -95,6 +97,7 @@ class RayParams:
|
||||
object_manager_port=None,
|
||||
node_manager_port=None,
|
||||
node_ip_address=None,
|
||||
raylet_ip_address=None,
|
||||
object_id_seed=None,
|
||||
driver_mode=None,
|
||||
redirect_worker_output=None,
|
||||
@@ -131,6 +134,7 @@ class RayParams:
|
||||
self.object_manager_port = object_manager_port
|
||||
self.node_manager_port = node_manager_port
|
||||
self.node_ip_address = node_ip_address
|
||||
self.raylet_ip_address = raylet_ip_address
|
||||
self.driver_mode = driver_mode
|
||||
self.redirect_worker_output = redirect_worker_output
|
||||
self.redirect_output = redirect_output
|
||||
|
||||
+59
-24
@@ -73,8 +73,14 @@ DEFAULT_JAVA_WORKER_CLASSPATH = [
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ProcessInfo = collections.namedtuple("ProcessInfo", [
|
||||
"process", "stdout_file", "stderr_file", "use_valgrind", "use_gdb",
|
||||
"use_valgrind_profiler", "use_perftools_profiler", "use_tmux"
|
||||
"process",
|
||||
"stdout_file",
|
||||
"stderr_file",
|
||||
"use_valgrind",
|
||||
"use_gdb",
|
||||
"use_valgrind_profiler",
|
||||
"use_perftools_profiler",
|
||||
"use_tmux",
|
||||
])
|
||||
|
||||
|
||||
@@ -189,7 +195,7 @@ def get_address_info_from_redis_helper(redis_address,
|
||||
return {
|
||||
"object_store_address": relevant_client["ObjectStoreSocketName"],
|
||||
"raylet_socket_name": relevant_client["RayletSocketName"],
|
||||
"node_manager_port": relevant_client["NodeManagerPort"]
|
||||
"node_manager_port": relevant_client["NodeManagerPort"],
|
||||
}
|
||||
|
||||
|
||||
@@ -430,9 +436,12 @@ def start_ray_process(command,
|
||||
logger.info("Detected environment variable '%s'.", gdb_env_var)
|
||||
use_gdb = True
|
||||
|
||||
if sum(
|
||||
[use_gdb, use_valgrind, use_valgrind_profiler, use_perftools_profiler
|
||||
]) > 1:
|
||||
if sum([
|
||||
use_gdb,
|
||||
use_valgrind,
|
||||
use_valgrind_profiler,
|
||||
use_perftools_profiler,
|
||||
]) > 1:
|
||||
raise ValueError(
|
||||
"At most one of the 'use_gdb', 'use_valgrind', "
|
||||
"'use_valgrind_profiler', and 'use_perftools_profiler' flags can "
|
||||
@@ -463,9 +472,12 @@ def start_ray_process(command,
|
||||
|
||||
if use_valgrind:
|
||||
command = [
|
||||
"valgrind", "--track-origins=yes", "--leak-check=full",
|
||||
"--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
|
||||
"--error-exitcode=1"
|
||||
"valgrind",
|
||||
"--track-origins=yes",
|
||||
"--leak-check=full",
|
||||
"--show-leak-kinds=all",
|
||||
"--leak-check-heuristics=stdstring",
|
||||
"--error-exitcode=1",
|
||||
] + command
|
||||
|
||||
if use_valgrind_profiler:
|
||||
@@ -1023,9 +1035,11 @@ def start_log_monitor(redis_address,
|
||||
log_monitor_filepath = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "log_monitor.py")
|
||||
command = [
|
||||
sys.executable, "-u", log_monitor_filepath,
|
||||
sys.executable,
|
||||
"-u",
|
||||
log_monitor_filepath,
|
||||
"--redis-address={}".format(redis_address),
|
||||
"--logs-dir={}".format(logs_dir)
|
||||
"--logs-dir={}".format(logs_dir),
|
||||
]
|
||||
if redis_password:
|
||||
command += ["--redis-password", redis_password]
|
||||
@@ -1059,8 +1073,10 @@ def start_reporter(redis_address,
|
||||
reporter_filepath = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "reporter.py")
|
||||
command = [
|
||||
sys.executable, "-u", reporter_filepath,
|
||||
"--redis-address={}".format(redis_address)
|
||||
sys.executable,
|
||||
"-u",
|
||||
reporter_filepath,
|
||||
"--redis-address={}".format(redis_address),
|
||||
]
|
||||
if redis_password:
|
||||
command += ["--redis-password", redis_password]
|
||||
@@ -1114,9 +1130,13 @@ def start_dashboard(require_webui,
|
||||
dashboard_filepath = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "dashboard/dashboard.py")
|
||||
command = [
|
||||
sys.executable, "-u", dashboard_filepath, "--host={}".format(host),
|
||||
"--port={}".format(port), "--redis-address={}".format(redis_address),
|
||||
"--temp-dir={}".format(temp_dir)
|
||||
sys.executable,
|
||||
"-u",
|
||||
dashboard_filepath,
|
||||
"--host={}".format(host),
|
||||
"--port={}".format(port),
|
||||
"--redis-address={}".format(redis_address),
|
||||
"--temp-dir={}".format(temp_dir),
|
||||
]
|
||||
if redis_password:
|
||||
command += ["--redis-password", redis_password]
|
||||
@@ -1290,13 +1310,15 @@ def start_raylet(redis_address,
|
||||
|
||||
# Create the command that the Raylet will use to start workers.
|
||||
start_worker_command = [
|
||||
sys.executable, worker_path,
|
||||
sys.executable,
|
||||
worker_path,
|
||||
"--node-ip-address={}".format(node_ip_address),
|
||||
"--node-manager-port={}".format(node_manager_port),
|
||||
"--object-store-name={}".format(plasma_store_name),
|
||||
"--raylet-name={}".format(raylet_name),
|
||||
"--redis-address={}".format(redis_address),
|
||||
"--config-list={}".format(config_str), "--temp-dir={}".format(temp_dir)
|
||||
"--config-list={}".format(config_str),
|
||||
"--temp-dir={}".format(temp_dir),
|
||||
]
|
||||
if redis_password:
|
||||
start_worker_command += ["--redis-password={}".format(redis_password)]
|
||||
@@ -1540,8 +1562,11 @@ def _start_plasma_store(plasma_store_memory,
|
||||
plasma_store_memory = int(plasma_store_memory)
|
||||
|
||||
command = [
|
||||
PLASMA_STORE_EXECUTABLE, "-s", socket_name, "-m",
|
||||
str(plasma_store_memory)
|
||||
PLASMA_STORE_EXECUTABLE,
|
||||
"-s",
|
||||
socket_name,
|
||||
"-m",
|
||||
str(plasma_store_memory),
|
||||
]
|
||||
if plasma_directory is not None:
|
||||
command += ["-d", plasma_directory]
|
||||
@@ -1617,6 +1642,7 @@ def start_worker(node_ip_address,
|
||||
redis_address,
|
||||
worker_path,
|
||||
temp_dir,
|
||||
raylet_ip_address=None,
|
||||
stdout_file=None,
|
||||
stderr_file=None,
|
||||
fate_share=None):
|
||||
@@ -1631,6 +1657,8 @@ def start_worker(node_ip_address,
|
||||
worker_path (str): The path of the source code which the worker process
|
||||
will run.
|
||||
temp_dir (str): The path of the temp dir.
|
||||
raylet_ip_address (str): The IP address of the worker's raylet. If not
|
||||
provided, it defaults to the node_ip_address.
|
||||
stdout_file: A file handle opened for writing to redirect stdout to. If
|
||||
no redirection should happen, then this should be None.
|
||||
stderr_file: A file handle opened for writing to redirect stderr to. If
|
||||
@@ -1640,12 +1668,17 @@ def start_worker(node_ip_address,
|
||||
ProcessInfo for the process that was started.
|
||||
"""
|
||||
command = [
|
||||
sys.executable, "-u", worker_path,
|
||||
sys.executable,
|
||||
"-u",
|
||||
worker_path,
|
||||
"--node-ip-address=" + node_ip_address,
|
||||
"--object-store-name=" + object_store_name,
|
||||
"--raylet-name=" + raylet_name,
|
||||
"--redis-address=" + str(redis_address), "--temp-dir=" + temp_dir
|
||||
"--redis-address=" + str(redis_address),
|
||||
"--temp-dir=" + temp_dir,
|
||||
]
|
||||
if raylet_ip_address is not None:
|
||||
command.append("--raylet-ip-address=" + raylet_ip_address)
|
||||
process_info = start_ray_process(
|
||||
command,
|
||||
ray_constants.PROCESS_TYPE_WORKER,
|
||||
@@ -1678,8 +1711,10 @@ def start_monitor(redis_address,
|
||||
monitor_path = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "monitor.py")
|
||||
command = [
|
||||
sys.executable, "-u", monitor_path,
|
||||
"--redis-address=" + str(redis_address)
|
||||
sys.executable,
|
||||
"-u",
|
||||
monitor_path,
|
||||
"--redis-address=" + str(redis_address),
|
||||
]
|
||||
if autoscaling_config:
|
||||
command.append("--autoscaling-config=" + str(autoscaling_config))
|
||||
|
||||
+10
-5
@@ -355,7 +355,7 @@ class Worker:
|
||||
"job_id": self.current_job_id.binary(),
|
||||
"function_id": function_to_run_id,
|
||||
"function": pickled_function,
|
||||
"run_on_other_drivers": str(run_on_other_drivers)
|
||||
"run_on_other_drivers": str(run_on_other_drivers),
|
||||
})
|
||||
self.redis_client.rpush("Exports", key)
|
||||
# TODO(rkn): If the worker fails after it calls setnx and before it
|
||||
@@ -689,6 +689,8 @@ def init(address=None,
|
||||
if node_ip_address is not None:
|
||||
node_ip_address = services.address_to_ip(node_ip_address)
|
||||
|
||||
raylet_ip_address = node_ip_address
|
||||
|
||||
_internal_config = (json.loads(_internal_config)
|
||||
if _internal_config else {})
|
||||
# Set the internal config options for LRU eviction.
|
||||
@@ -708,6 +710,7 @@ def init(address=None,
|
||||
redis_address=redis_address,
|
||||
redis_port=redis_port,
|
||||
node_ip_address=node_ip_address,
|
||||
raylet_ip_address=raylet_ip_address,
|
||||
object_id_seed=object_id_seed,
|
||||
driver_mode=driver_mode,
|
||||
redirect_worker_output=redirect_worker_output,
|
||||
@@ -788,6 +791,7 @@ def init(address=None,
|
||||
# In this case, we only need to connect the node.
|
||||
ray_params = ray.parameter.RayParams(
|
||||
node_ip_address=node_ip_address,
|
||||
raylet_ip_address=raylet_ip_address,
|
||||
redis_address=redis_address,
|
||||
redis_password=redis_password,
|
||||
object_id_seed=object_id_seed,
|
||||
@@ -1053,7 +1057,7 @@ def listen_error_messages_raylet(worker, task_error_queue, threads_stopped):
|
||||
job_id = error_data.job_id
|
||||
if job_id not in [
|
||||
worker.current_job_id.binary(),
|
||||
JobID.nil().binary()
|
||||
JobID.nil().binary(),
|
||||
]:
|
||||
continue
|
||||
|
||||
@@ -1226,6 +1230,7 @@ def connect(node,
|
||||
int(redis_port),
|
||||
node.redis_password,
|
||||
)
|
||||
|
||||
worker.core_worker = ray._raylet.CoreWorker(
|
||||
(mode == SCRIPT_MODE or mode == LOCAL_MODE),
|
||||
node.plasma_store_socket_name,
|
||||
@@ -1235,6 +1240,7 @@ def connect(node,
|
||||
node.get_logs_dir_path(),
|
||||
node.node_ip_address,
|
||||
node.node_manager_port,
|
||||
node.raylet_ip_address,
|
||||
(mode == LOCAL_MODE),
|
||||
driver_name,
|
||||
log_stdout_file_name,
|
||||
@@ -1575,9 +1581,8 @@ def wait(object_ids, num_returns=1, timeout=None):
|
||||
blocking_wait_inside_async_warned = True
|
||||
|
||||
if isinstance(object_ids, ObjectID):
|
||||
raise TypeError(
|
||||
"wait() expected a list of ray.ObjectID, got a single ray.ObjectID"
|
||||
)
|
||||
raise TypeError("wait() expected a list of ray.ObjectID, got a single "
|
||||
"ray.ObjectID")
|
||||
|
||||
if not isinstance(object_ids, list):
|
||||
raise TypeError(
|
||||
|
||||
@@ -21,6 +21,12 @@ parser.add_argument(
|
||||
required=True,
|
||||
type=int,
|
||||
help="the port of the worker's node")
|
||||
parser.add_argument(
|
||||
"--raylet-ip-address",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="the ip address of the worker's raylet")
|
||||
parser.add_argument(
|
||||
"--redis-address",
|
||||
required=True,
|
||||
@@ -89,8 +95,13 @@ if __name__ == "__main__":
|
||||
internal_config[config_list[i]] = config_list[i + 1]
|
||||
i += 2
|
||||
|
||||
raylet_ip_address = args.raylet_ip_address
|
||||
if raylet_ip_address is None:
|
||||
raylet_ip_address = args.node_ip_address
|
||||
|
||||
ray_params = RayParams(
|
||||
node_ip_address=args.node_ip_address,
|
||||
raylet_ip_address=raylet_ip_address,
|
||||
node_manager_port=args.node_manager_port,
|
||||
redis_address=args.redis_address,
|
||||
redis_password=args.redis_password,
|
||||
|
||||
Reference in New Issue
Block a user