mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 14:48:54 +08:00
Use grpc for communication from worker to local raylet (task submission and direct actor args only) (#6118)
* Skeleton for SubmitTask proto * Pass through node manager port, connect in raylet client * Switch submit task to grpc * Check port in use * doc * Remove default port, set port randomly from driver * update * Fix test * Fix object manager test
This commit is contained in:
@@ -687,7 +687,7 @@ cdef class CoreWorker:
|
||||
|
||||
def __cinit__(self, is_driver, store_socket, raylet_socket,
|
||||
JobID job_id, GcsClientOptions gcs_options, log_dir,
|
||||
node_ip_address):
|
||||
node_ip_address, node_manager_port):
|
||||
assert pyarrow is not None, ("Expected pyarrow to be imported from "
|
||||
"outside _raylet. See __init__.py for "
|
||||
"details.")
|
||||
@@ -697,8 +697,8 @@ cdef class CoreWorker:
|
||||
LANGUAGE_PYTHON, store_socket.encode("ascii"),
|
||||
raylet_socket.encode("ascii"), job_id.native(),
|
||||
gcs_options.native()[0], log_dir.encode("utf-8"),
|
||||
node_ip_address.encode("utf-8"), task_execution_handler,
|
||||
check_signals, exit_handler))
|
||||
node_ip_address.encode("utf-8"), node_manager_port,
|
||||
task_execution_handler, check_signals, exit_handler))
|
||||
|
||||
def disconnect(self):
|
||||
with nogil:
|
||||
|
||||
@@ -55,6 +55,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const c_string &raylet_socket, const CJobID &job_id,
|
||||
const CGcsClientOptions &gcs_options,
|
||||
const c_string &log_dir, const c_string &node_ip_address,
|
||||
int node_manager_port,
|
||||
CRayStatus (
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
|
||||
+28
-2
@@ -10,6 +10,7 @@ import json
|
||||
import os
|
||||
import logging
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
@@ -117,7 +118,8 @@ class Node(object):
|
||||
|
||||
# If user does not provide the socket name, get it from Redis.
|
||||
if (self._plasma_store_socket_name is None
|
||||
or self._raylet_socket_name is None):
|
||||
or self._raylet_socket_name is None
|
||||
or self._ray_params.node_manager_port is None):
|
||||
# Get the address info of the processes to connect to
|
||||
# from Redis.
|
||||
address_info = ray.services.get_address_info_from_redis(
|
||||
@@ -127,6 +129,8 @@ class Node(object):
|
||||
self._plasma_store_socket_name = address_info[
|
||||
"object_store_address"]
|
||||
self._raylet_socket_name = address_info["raylet_socket_name"]
|
||||
self._ray_params.node_manager_port = address_info[
|
||||
"node_manager_port"]
|
||||
else:
|
||||
# If the user specified a socket name, use it.
|
||||
self._plasma_store_socket_name = self._prepare_socket_file(
|
||||
@@ -144,6 +148,16 @@ class Node(object):
|
||||
ray_params.include_java = (
|
||||
ray.services.include_java_from_redis(redis_client))
|
||||
|
||||
if head or not connect_only:
|
||||
# We need to start a local raylet.
|
||||
if (self._ray_params.node_manager_port is None
|
||||
or self._ray_params.node_manager_port == 0):
|
||||
# No port specified. Pick a random port for the raylet to use.
|
||||
# NOTE: There is a possible but unlikely race condition where
|
||||
# the port is bound by another process between now and when the
|
||||
# raylet starts.
|
||||
self._ray_params.node_manager_port = self._get_unused_port()
|
||||
|
||||
# Start processes.
|
||||
if head:
|
||||
self.start_head_processes()
|
||||
@@ -294,6 +308,11 @@ class Node(object):
|
||||
"""Get the node's raylet socket name."""
|
||||
return self._raylet_socket_name
|
||||
|
||||
@property
|
||||
def node_manager_port(self):
|
||||
"""Get the node manager's port."""
|
||||
return self._ray_params.node_manager_port
|
||||
|
||||
@property
|
||||
def address_info(self):
|
||||
"""Get a dictionary of addresses."""
|
||||
@@ -390,6 +409,13 @@ class Node(object):
|
||||
log_stderr_file = open(log_stderr, "a", buffering=1)
|
||||
return log_stdout_file, log_stderr_file
|
||||
|
||||
def _get_unused_port(self):
    """Return the number of a TCP port that is currently unused.

    A temporary IPv4 stream socket is bound to port 0, which makes the
    operating system assign a free port. The assigned port number is read
    back and the socket is closed before returning.

    NOTE: The port is released before this method returns, so another
    process may bind it before the caller does.

    Returns:
        The port number (int) that the operating system assigned.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind(("", 0))
        return s.getsockname()[1]
    finally:
        # Always release the descriptor, even if bind() or getsockname()
        # raises; otherwise the socket would leak.
        s.close()
|
||||
|
||||
def _prepare_socket_file(self, socket_path, default_prefix):
|
||||
"""Prepare the socket file for raylet and plasma.
|
||||
|
||||
@@ -508,6 +534,7 @@ class Node(object):
|
||||
process_info = ray.services.start_raylet(
|
||||
self._redis_address,
|
||||
self._node_ip_address,
|
||||
self._ray_params.node_manager_port,
|
||||
self._raylet_socket_name,
|
||||
self._plasma_store_socket_name,
|
||||
self._ray_params.worker_path,
|
||||
@@ -515,7 +542,6 @@ class Node(object):
|
||||
self._session_dir,
|
||||
self.get_resource_spec(),
|
||||
self._ray_params.object_manager_port,
|
||||
self._ray_params.node_manager_port,
|
||||
self._ray_params.redis_password,
|
||||
use_valgrind=use_valgrind,
|
||||
use_profiler=use_profiler,
|
||||
|
||||
+10
-9
@@ -153,6 +153,7 @@ def get_address_info_from_redis_helper(redis_address,
|
||||
return {
|
||||
"object_store_address": relevant_client["ObjectStoreSocketName"],
|
||||
"raylet_socket_name": relevant_client["RayletSocketName"],
|
||||
"node_manager_port": relevant_client["NodeManagerPort"]
|
||||
}
|
||||
|
||||
|
||||
@@ -1045,6 +1046,7 @@ def start_dashboard(host,
|
||||
|
||||
def start_raylet(redis_address,
|
||||
node_ip_address,
|
||||
node_manager_port,
|
||||
raylet_name,
|
||||
plasma_store_name,
|
||||
worker_path,
|
||||
@@ -1052,7 +1054,6 @@ def start_raylet(redis_address,
|
||||
session_dir,
|
||||
resource_spec,
|
||||
object_manager_port=None,
|
||||
node_manager_port=None,
|
||||
redis_password=None,
|
||||
use_valgrind=False,
|
||||
use_profiler=False,
|
||||
@@ -1068,6 +1069,8 @@ def start_raylet(redis_address,
|
||||
Args:
|
||||
redis_address (str): The address of the primary Redis server.
|
||||
node_ip_address (str): The IP address of this node.
|
||||
node_manager_port(int): The port to use for the node manager. This must
|
||||
not be 0.
|
||||
raylet_name (str): The name of the raylet socket to create.
|
||||
plasma_store_name (str): The name of the plasma store socket to connect
|
||||
to.
|
||||
@@ -1078,8 +1081,6 @@ def start_raylet(redis_address,
|
||||
resource_spec (ResourceSpec): Resources for this raylet.
|
||||
object_manager_port: The port to use for the object manager. If this is
|
||||
None, then the object manager will choose its own port.
|
||||
node_manager_port: The port to use for the node manager. If this is
|
||||
None, then the node manager will choose its own port.
|
||||
redis_password: The password to use when connecting to Redis.
|
||||
use_valgrind (bool): True if the raylet should be started inside
|
||||
of valgrind. If this is True, use_profiler must be False.
|
||||
@@ -1098,6 +1099,9 @@ def start_raylet(redis_address,
|
||||
Returns:
|
||||
ProcessInfo for the process that was started.
|
||||
"""
|
||||
# The caller must provide a node manager port so that we can correctly
|
||||
# populate the command to start a worker.
|
||||
assert node_manager_port is not None and node_manager_port != 0
|
||||
config = config or {}
|
||||
config_str = ",".join(["{},{}".format(*kv) for kv in config.items()])
|
||||
|
||||
@@ -1137,13 +1141,14 @@ def start_raylet(redis_address,
|
||||
# Create the command that the Raylet will use to start workers.
|
||||
start_worker_command = ("{} {} "
|
||||
"--node-ip-address={} "
|
||||
"--node-manager-port={} "
|
||||
"--object-store-name={} "
|
||||
"--raylet-name={} "
|
||||
"--redis-address={} "
|
||||
"--temp-dir={}".format(
|
||||
sys.executable, worker_path, node_ip_address,
|
||||
plasma_store_name, raylet_name, redis_address,
|
||||
temp_dir))
|
||||
node_manager_port, plasma_store_name,
|
||||
raylet_name, redis_address, temp_dir))
|
||||
if redis_password:
|
||||
start_worker_command += " --redis-password {}".format(redis_password)
|
||||
|
||||
@@ -1151,10 +1156,6 @@ def start_raylet(redis_address,
|
||||
# manager to choose its own port.
|
||||
if object_manager_port is None:
|
||||
object_manager_port = 0
|
||||
# If the node manager port is None, then use 0 to cause the node manager
|
||||
# to choose its own port.
|
||||
if node_manager_port is None:
|
||||
node_manager_port = 0
|
||||
|
||||
if load_code_from_local:
|
||||
start_worker_command += " --load-code-from-local "
|
||||
|
||||
@@ -92,6 +92,8 @@ class Cluster(object):
|
||||
self.webui_url = self.head_node.webui_url
|
||||
else:
|
||||
ray_params.update_if_absent(redis_address=self.redis_address)
|
||||
# Let grpc pick a port.
|
||||
ray_params.update(node_manager_port=0)
|
||||
node = ray.node.Node(
|
||||
ray_params,
|
||||
head=False,
|
||||
|
||||
@@ -1215,6 +1215,7 @@ def connect(node,
|
||||
gcs_options,
|
||||
node.get_logs_dir_path(),
|
||||
node.node_ip_address,
|
||||
node.node_manager_port,
|
||||
)
|
||||
worker.raylet_client = ray._raylet.RayletClient(worker.core_worker)
|
||||
|
||||
|
||||
@@ -19,6 +19,11 @@ parser.add_argument(
|
||||
required=True,
|
||||
type=str,
|
||||
help="the ip address of the worker's node")
|
||||
parser.add_argument(
|
||||
"--node-manager-port",
|
||||
required=True,
|
||||
type=int,
|
||||
help="the port of the worker's node")
|
||||
parser.add_argument(
|
||||
"--redis-address",
|
||||
required=True,
|
||||
@@ -74,6 +79,7 @@ if __name__ == "__main__":
|
||||
|
||||
ray_params = RayParams(
|
||||
node_ip_address=args.node_ip_address,
|
||||
node_manager_port=args.node_manager_port,
|
||||
redis_address=args.redis_address,
|
||||
redis_password=args.redis_password,
|
||||
plasma_store_socket_name=args.object_store_name,
|
||||
|
||||
Reference in New Issue
Block a user