[Core] Added ability to specify different IP addresses for a core worker and its raylet. (#7985)

This commit is contained in:
Clark Zinzow
2020-04-16 09:32:24 -06:00
committed by GitHub
parent d0fab84e4d
commit d4cae5f632
26 changed files with 170 additions and 71 deletions
+3 -2
View File
@@ -630,8 +630,8 @@ cdef class CoreWorker:
def __cinit__(self, is_driver, store_socket, raylet_socket,
JobID job_id, GcsClientOptions gcs_options, log_dir,
node_ip_address, node_manager_port, local_mode,
driver_name, stdout_file, stderr_file):
node_ip_address, node_manager_port, raylet_ip_address,
local_mode, driver_name, stdout_file, stderr_file):
self.is_driver = is_driver
self.is_local_mode = local_mode
@@ -647,6 +647,7 @@ cdef class CoreWorker:
options.install_failure_signal_handler = True
options.node_ip_address = node_ip_address.encode("utf-8")
options.node_manager_port = node_manager_port
options.raylet_ip_address = raylet_ip_address.encode("utf-8")
options.driver_name = driver_name
options.stdout_file = stdout_file
options.stderr_file = stderr_file
+1
View File
@@ -195,6 +195,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
c_bool install_failure_signal_handler
c_string node_ip_address
int node_manager_port
c_string raylet_ip_address
c_string driver_name
c_string stdout_file
c_string stderr_file
+28 -9
View File
@@ -80,6 +80,19 @@ class Node:
node_ip_address = ray.services.get_node_ip_address()
self._node_ip_address = node_ip_address
if ray_params.raylet_ip_address:
raylet_ip_address = ray_params.raylet_ip_address
else:
raylet_ip_address = node_ip_address
if raylet_ip_address != node_ip_address and (not connect_only or head):
raise ValueError(
"The raylet IP address should only be different than the node "
"IP address when connecting to an existing raylet; i.e., when "
"head=False and connect_only=True.")
self._raylet_ip_address = raylet_ip_address
ray_params.update_if_absent(
include_log_monitor=True,
resources={},
@@ -122,7 +135,7 @@ class Node:
# from Redis.
address_info = ray.services.get_address_info_from_redis(
self.redis_address,
self._node_ip_address,
self._raylet_ip_address,
redis_password=self.redis_password)
self._plasma_store_socket_name = address_info[
"object_store_address"]
@@ -229,9 +242,14 @@ class Node:
@property
def node_ip_address(self):
"""Get the cluster Redis address."""
"""Get the IP address of this node."""
return self._node_ip_address
@property
def raylet_ip_address(self):
"""Get the IP address of the raylet that this node connects to.

This is the node IP address unless the Ray parameters supplied a
different raylet IP address, which is only accepted when connecting to
an existing raylet (head=False and connect_only=True).
"""
return self._raylet_ip_address
@property
def address(self):
"""Get the cluster address."""
@@ -287,6 +305,7 @@ class Node:
"""Get a dictionary of addresses."""
return {
"node_ip_address": self._node_ip_address,
"raylet_ip_address": self._raylet_ip_address,
"redis_address": self._redis_address,
"object_store_address": self._plasma_store_socket_name,
"raylet_socket_name": self._raylet_socket_name,
@@ -429,7 +448,7 @@ class Node:
assert ray_constants.PROCESS_TYPE_REAPER not in self.all_processes
if process_info is not None:
self.all_processes[ray_constants.PROCESS_TYPE_REAPER] = [
process_info
process_info,
]
def start_redis(self):
@@ -469,7 +488,7 @@ class Node:
fate_share=self.kernel_fate_share)
assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes
self.all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] = [
process_info
process_info,
]
def start_reporter(self):
@@ -484,7 +503,7 @@ class Node:
assert ray_constants.PROCESS_TYPE_REPORTER not in self.all_processes
if process_info is not None:
self.all_processes[ray_constants.PROCESS_TYPE_REPORTER] = [
process_info
process_info,
]
def start_dashboard(self, require_webui):
@@ -508,7 +527,7 @@ class Node:
assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes
if process_info is not None:
self.all_processes[ray_constants.PROCESS_TYPE_DASHBOARD] = [
process_info
process_info,
]
redis_client = self.create_redis_client()
redis_client.hmset("webui", {"url": self._webui_url})
@@ -527,7 +546,7 @@ class Node:
assert (
ray_constants.PROCESS_TYPE_PLASMA_STORE not in self.all_processes)
self.all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] = [
process_info
process_info,
]
def start_gcs_server(self):
@@ -544,7 +563,7 @@ class Node:
assert (
ray_constants.PROCESS_TYPE_GCS_SERVER not in self.all_processes)
self.all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER] = [
process_info
process_info,
]
def start_raylet(self, use_valgrind=False, use_profiler=False):
@@ -617,7 +636,7 @@ class Node:
assert (ray_constants.PROCESS_TYPE_RAYLET_MONITOR not in
self.all_processes)
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR] = [
process_info
process_info,
]
def start_head_processes(self):
+4
View File
@@ -33,6 +33,8 @@ class RayParams:
object_manager_port (int): The port to use for the object manager.
node_manager_port: The port to use for the node manager.
node_ip_address (str): The IP address of the node that we are on.
raylet_ip_address (str): The IP address of the raylet that this node
connects to.
object_id_seed (int): Used to seed the deterministic generation of
object IDs. The same value can be used across multiple runs of the
same job in order to generate the object IDs in a consistent
@@ -95,6 +97,7 @@ class RayParams:
object_manager_port=None,
node_manager_port=None,
node_ip_address=None,
raylet_ip_address=None,
object_id_seed=None,
driver_mode=None,
redirect_worker_output=None,
@@ -131,6 +134,7 @@ class RayParams:
self.object_manager_port = object_manager_port
self.node_manager_port = node_manager_port
self.node_ip_address = node_ip_address
self.raylet_ip_address = raylet_ip_address
self.driver_mode = driver_mode
self.redirect_worker_output = redirect_worker_output
self.redirect_output = redirect_output
+59 -24
View File
@@ -73,8 +73,14 @@ DEFAULT_JAVA_WORKER_CLASSPATH = [
logger = logging.getLogger(__name__)
ProcessInfo = collections.namedtuple("ProcessInfo", [
"process", "stdout_file", "stderr_file", "use_valgrind", "use_gdb",
"use_valgrind_profiler", "use_perftools_profiler", "use_tmux"
"process",
"stdout_file",
"stderr_file",
"use_valgrind",
"use_gdb",
"use_valgrind_profiler",
"use_perftools_profiler",
"use_tmux",
])
@@ -189,7 +195,7 @@ def get_address_info_from_redis_helper(redis_address,
return {
"object_store_address": relevant_client["ObjectStoreSocketName"],
"raylet_socket_name": relevant_client["RayletSocketName"],
"node_manager_port": relevant_client["NodeManagerPort"]
"node_manager_port": relevant_client["NodeManagerPort"],
}
@@ -430,9 +436,12 @@ def start_ray_process(command,
logger.info("Detected environment variable '%s'.", gdb_env_var)
use_gdb = True
if sum(
[use_gdb, use_valgrind, use_valgrind_profiler, use_perftools_profiler
]) > 1:
if sum([
use_gdb,
use_valgrind,
use_valgrind_profiler,
use_perftools_profiler,
]) > 1:
raise ValueError(
"At most one of the 'use_gdb', 'use_valgrind', "
"'use_valgrind_profiler', and 'use_perftools_profiler' flags can "
@@ -463,9 +472,12 @@ def start_ray_process(command,
if use_valgrind:
command = [
"valgrind", "--track-origins=yes", "--leak-check=full",
"--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
"--error-exitcode=1"
"valgrind",
"--track-origins=yes",
"--leak-check=full",
"--show-leak-kinds=all",
"--leak-check-heuristics=stdstring",
"--error-exitcode=1",
] + command
if use_valgrind_profiler:
@@ -1023,9 +1035,11 @@ def start_log_monitor(redis_address,
log_monitor_filepath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "log_monitor.py")
command = [
sys.executable, "-u", log_monitor_filepath,
sys.executable,
"-u",
log_monitor_filepath,
"--redis-address={}".format(redis_address),
"--logs-dir={}".format(logs_dir)
"--logs-dir={}".format(logs_dir),
]
if redis_password:
command += ["--redis-password", redis_password]
@@ -1059,8 +1073,10 @@ def start_reporter(redis_address,
reporter_filepath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "reporter.py")
command = [
sys.executable, "-u", reporter_filepath,
"--redis-address={}".format(redis_address)
sys.executable,
"-u",
reporter_filepath,
"--redis-address={}".format(redis_address),
]
if redis_password:
command += ["--redis-password", redis_password]
@@ -1114,9 +1130,13 @@ def start_dashboard(require_webui,
dashboard_filepath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "dashboard/dashboard.py")
command = [
sys.executable, "-u", dashboard_filepath, "--host={}".format(host),
"--port={}".format(port), "--redis-address={}".format(redis_address),
"--temp-dir={}".format(temp_dir)
sys.executable,
"-u",
dashboard_filepath,
"--host={}".format(host),
"--port={}".format(port),
"--redis-address={}".format(redis_address),
"--temp-dir={}".format(temp_dir),
]
if redis_password:
command += ["--redis-password", redis_password]
@@ -1290,13 +1310,15 @@ def start_raylet(redis_address,
# Create the command that the Raylet will use to start workers.
start_worker_command = [
sys.executable, worker_path,
sys.executable,
worker_path,
"--node-ip-address={}".format(node_ip_address),
"--node-manager-port={}".format(node_manager_port),
"--object-store-name={}".format(plasma_store_name),
"--raylet-name={}".format(raylet_name),
"--redis-address={}".format(redis_address),
"--config-list={}".format(config_str), "--temp-dir={}".format(temp_dir)
"--config-list={}".format(config_str),
"--temp-dir={}".format(temp_dir),
]
if redis_password:
start_worker_command += ["--redis-password={}".format(redis_password)]
@@ -1540,8 +1562,11 @@ def _start_plasma_store(plasma_store_memory,
plasma_store_memory = int(plasma_store_memory)
command = [
PLASMA_STORE_EXECUTABLE, "-s", socket_name, "-m",
str(plasma_store_memory)
PLASMA_STORE_EXECUTABLE,
"-s",
socket_name,
"-m",
str(plasma_store_memory),
]
if plasma_directory is not None:
command += ["-d", plasma_directory]
@@ -1617,6 +1642,7 @@ def start_worker(node_ip_address,
redis_address,
worker_path,
temp_dir,
raylet_ip_address=None,
stdout_file=None,
stderr_file=None,
fate_share=None):
@@ -1631,6 +1657,8 @@ def start_worker(node_ip_address,
worker_path (str): The path of the source code which the worker process
will run.
temp_dir (str): The path of the temp dir.
raylet_ip_address (str): The IP address of the worker's raylet. If not
provided, it defaults to the node_ip_address.
stdout_file: A file handle opened for writing to redirect stdout to. If
no redirection should happen, then this should be None.
stderr_file: A file handle opened for writing to redirect stderr to. If
@@ -1640,12 +1668,17 @@ def start_worker(node_ip_address,
ProcessInfo for the process that was started.
"""
command = [
sys.executable, "-u", worker_path,
sys.executable,
"-u",
worker_path,
"--node-ip-address=" + node_ip_address,
"--object-store-name=" + object_store_name,
"--raylet-name=" + raylet_name,
"--redis-address=" + str(redis_address), "--temp-dir=" + temp_dir
"--redis-address=" + str(redis_address),
"--temp-dir=" + temp_dir,
]
if raylet_ip_address is not None:
command.append("--raylet-ip-address=" + raylet_ip_address)
process_info = start_ray_process(
command,
ray_constants.PROCESS_TYPE_WORKER,
@@ -1678,8 +1711,10 @@ def start_monitor(redis_address,
monitor_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "monitor.py")
command = [
sys.executable, "-u", monitor_path,
"--redis-address=" + str(redis_address)
sys.executable,
"-u",
monitor_path,
"--redis-address=" + str(redis_address),
]
if autoscaling_config:
command.append("--autoscaling-config=" + str(autoscaling_config))
+10 -5
View File
@@ -355,7 +355,7 @@ class Worker:
"job_id": self.current_job_id.binary(),
"function_id": function_to_run_id,
"function": pickled_function,
"run_on_other_drivers": str(run_on_other_drivers)
"run_on_other_drivers": str(run_on_other_drivers),
})
self.redis_client.rpush("Exports", key)
# TODO(rkn): If the worker fails after it calls setnx and before it
@@ -689,6 +689,8 @@ def init(address=None,
if node_ip_address is not None:
node_ip_address = services.address_to_ip(node_ip_address)
raylet_ip_address = node_ip_address
_internal_config = (json.loads(_internal_config)
if _internal_config else {})
# Set the internal config options for LRU eviction.
@@ -708,6 +710,7 @@ def init(address=None,
redis_address=redis_address,
redis_port=redis_port,
node_ip_address=node_ip_address,
raylet_ip_address=raylet_ip_address,
object_id_seed=object_id_seed,
driver_mode=driver_mode,
redirect_worker_output=redirect_worker_output,
@@ -788,6 +791,7 @@ def init(address=None,
# In this case, we only need to connect the node.
ray_params = ray.parameter.RayParams(
node_ip_address=node_ip_address,
raylet_ip_address=raylet_ip_address,
redis_address=redis_address,
redis_password=redis_password,
object_id_seed=object_id_seed,
@@ -1053,7 +1057,7 @@ def listen_error_messages_raylet(worker, task_error_queue, threads_stopped):
job_id = error_data.job_id
if job_id not in [
worker.current_job_id.binary(),
JobID.nil().binary()
JobID.nil().binary(),
]:
continue
@@ -1226,6 +1230,7 @@ def connect(node,
int(redis_port),
node.redis_password,
)
worker.core_worker = ray._raylet.CoreWorker(
(mode == SCRIPT_MODE or mode == LOCAL_MODE),
node.plasma_store_socket_name,
@@ -1235,6 +1240,7 @@ def connect(node,
node.get_logs_dir_path(),
node.node_ip_address,
node.node_manager_port,
node.raylet_ip_address,
(mode == LOCAL_MODE),
driver_name,
log_stdout_file_name,
@@ -1575,9 +1581,8 @@ def wait(object_ids, num_returns=1, timeout=None):
blocking_wait_inside_async_warned = True
if isinstance(object_ids, ObjectID):
raise TypeError(
"wait() expected a list of ray.ObjectID, got a single ray.ObjectID"
)
raise TypeError("wait() expected a list of ray.ObjectID, got a single "
"ray.ObjectID")
if not isinstance(object_ids, list):
raise TypeError(
+11
View File
@@ -21,6 +21,12 @@ parser.add_argument(
required=True,
type=int,
help="the port of the worker's node")
# Optional flag: when it is omitted (None), the __main__ section of this
# script uses the value of --node-ip-address as the raylet IP address.
parser.add_argument(
"--raylet-ip-address",
required=False,
type=str,
default=None,
help="the ip address of the worker's raylet")
parser.add_argument(
"--redis-address",
required=True,
@@ -89,8 +95,13 @@ if __name__ == "__main__":
internal_config[config_list[i]] = config_list[i + 1]
i += 2
raylet_ip_address = args.raylet_ip_address
if raylet_ip_address is None:
raylet_ip_address = args.node_ip_address
ray_params = RayParams(
node_ip_address=args.node_ip_address,
raylet_ip_address=raylet_ip_address,
node_manager_port=args.node_manager_port,
redis_address=args.redis_address,
redis_password=args.redis_password,