Use grpc for communication from worker to local raylet (task submission and direct actor args only) (#6118)

* Skeleton for SubmitTask proto

* Pass through node manager port, connect in raylet client

* Switch submit task to grpc

* Check port in use

* doc

* Remove default port, set port randomly from driver

* update

* Fix test

* Fix object manager test
This commit is contained in:
Stephanie Wang
2019-11-11 21:17:25 -08:00
committed by GitHub
parent f48293f96d
commit 35d177f459
23 changed files with 257 additions and 93 deletions
+3 -3
View File
@@ -687,7 +687,7 @@ cdef class CoreWorker:
def __cinit__(self, is_driver, store_socket, raylet_socket,
JobID job_id, GcsClientOptions gcs_options, log_dir,
node_ip_address):
node_ip_address, node_manager_port):
assert pyarrow is not None, ("Expected pyarrow to be imported from "
"outside _raylet. See __init__.py for "
"details.")
@@ -697,8 +697,8 @@ cdef class CoreWorker:
LANGUAGE_PYTHON, store_socket.encode("ascii"),
raylet_socket.encode("ascii"), job_id.native(),
gcs_options.native()[0], log_dir.encode("utf-8"),
node_ip_address.encode("utf-8"), task_execution_handler,
check_signals, exit_handler))
node_ip_address.encode("utf-8"), node_manager_port,
task_execution_handler, check_signals, exit_handler))
def disconnect(self):
with nogil:
+1
View File
@@ -55,6 +55,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const c_string &raylet_socket, const CJobID &job_id,
const CGcsClientOptions &gcs_options,
const c_string &log_dir, const c_string &node_ip_address,
int node_manager_port,
CRayStatus (
CTaskType task_type,
const CRayFunction &ray_function,
+28 -2
View File
@@ -10,6 +10,7 @@ import json
import os
import logging
import signal
import socket
import sys
import tempfile
import threading
@@ -117,7 +118,8 @@ class Node(object):
# If user does not provide the socket name, get it from Redis.
if (self._plasma_store_socket_name is None
or self._raylet_socket_name is None):
or self._raylet_socket_name is None
or self._ray_params.node_manager_port is None):
# Get the address info of the processes to connect to
# from Redis.
address_info = ray.services.get_address_info_from_redis(
@@ -127,6 +129,8 @@ class Node(object):
self._plasma_store_socket_name = address_info[
"object_store_address"]
self._raylet_socket_name = address_info["raylet_socket_name"]
self._ray_params.node_manager_port = address_info[
"node_manager_port"]
else:
# If the user specified a socket name, use it.
self._plasma_store_socket_name = self._prepare_socket_file(
@@ -144,6 +148,16 @@ class Node(object):
ray_params.include_java = (
ray.services.include_java_from_redis(redis_client))
if head or not connect_only:
# We need to start a local raylet.
if (self._ray_params.node_manager_port is None
or self._ray_params.node_manager_port == 0):
# No port specified. Pick a random port for the raylet to use.
# NOTE: There is a possible but unlikely race condition where
# the port is bound by another process between now and when the
# raylet starts.
self._ray_params.node_manager_port = self._get_unused_port()
# Start processes.
if head:
self.start_head_processes()
@@ -294,6 +308,11 @@ class Node(object):
"""Get the node's raylet socket name."""
return self._raylet_socket_name
@property
def node_manager_port(self):
    """Return the node manager port held in this node's `_ray_params`."""
    params = self._ray_params
    return params.node_manager_port
@property
def address_info(self):
"""Get a dictionary of addresses."""
@@ -390,6 +409,13 @@ class Node(object):
log_stderr_file = open(log_stderr, "a", buffering=1)
return log_stdout_file, log_stderr_file
def _get_unused_port(self):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("", 0))
port = s.getsockname()[1]
s.close()
return port
def _prepare_socket_file(self, socket_path, default_prefix):
"""Prepare the socket file for raylet and plasma.
@@ -508,6 +534,7 @@ class Node(object):
process_info = ray.services.start_raylet(
self._redis_address,
self._node_ip_address,
self._ray_params.node_manager_port,
self._raylet_socket_name,
self._plasma_store_socket_name,
self._ray_params.worker_path,
@@ -515,7 +542,6 @@ class Node(object):
self._session_dir,
self.get_resource_spec(),
self._ray_params.object_manager_port,
self._ray_params.node_manager_port,
self._ray_params.redis_password,
use_valgrind=use_valgrind,
use_profiler=use_profiler,
+10 -9
View File
@@ -153,6 +153,7 @@ def get_address_info_from_redis_helper(redis_address,
return {
"object_store_address": relevant_client["ObjectStoreSocketName"],
"raylet_socket_name": relevant_client["RayletSocketName"],
"node_manager_port": relevant_client["NodeManagerPort"]
}
@@ -1045,6 +1046,7 @@ def start_dashboard(host,
def start_raylet(redis_address,
node_ip_address,
node_manager_port,
raylet_name,
plasma_store_name,
worker_path,
@@ -1052,7 +1054,6 @@ def start_raylet(redis_address,
session_dir,
resource_spec,
object_manager_port=None,
node_manager_port=None,
redis_password=None,
use_valgrind=False,
use_profiler=False,
@@ -1068,6 +1069,8 @@ def start_raylet(redis_address,
Args:
redis_address (str): The address of the primary Redis server.
node_ip_address (str): The IP address of this node.
node_manager_port (int): The port to use for the node manager. This must
not be 0.
raylet_name (str): The name of the raylet socket to create.
plasma_store_name (str): The name of the plasma store socket to connect
to.
@@ -1078,8 +1081,6 @@ def start_raylet(redis_address,
resource_spec (ResourceSpec): Resources for this raylet.
object_manager_port: The port to use for the object manager. If this is
None, then the object manager will choose its own port.
node_manager_port: The port to use for the node manager. If this is
None, then the node manager will choose its own port.
redis_password: The password to use when connecting to Redis.
use_valgrind (bool): True if the raylet should be started inside
of valgrind. If this is True, use_profiler must be False.
@@ -1098,6 +1099,9 @@ def start_raylet(redis_address,
Returns:
ProcessInfo for the process that was started.
"""
# The caller must provide a node manager port so that we can correctly
# populate the command to start a worker.
assert node_manager_port is not None and node_manager_port != 0
config = config or {}
config_str = ",".join(["{},{}".format(*kv) for kv in config.items()])
@@ -1137,13 +1141,14 @@ def start_raylet(redis_address,
# Create the command that the Raylet will use to start workers.
start_worker_command = ("{} {} "
"--node-ip-address={} "
"--node-manager-port={} "
"--object-store-name={} "
"--raylet-name={} "
"--redis-address={} "
"--temp-dir={}".format(
sys.executable, worker_path, node_ip_address,
plasma_store_name, raylet_name, redis_address,
temp_dir))
node_manager_port, plasma_store_name,
raylet_name, redis_address, temp_dir))
if redis_password:
start_worker_command += " --redis-password {}".format(redis_password)
@@ -1151,10 +1156,6 @@ def start_raylet(redis_address,
# manager to choose its own port.
if object_manager_port is None:
object_manager_port = 0
# If the node manager port is None, then use 0 to cause the node manager
# to choose its own port.
if node_manager_port is None:
node_manager_port = 0
if load_code_from_local:
start_worker_command += " --load-code-from-local "
+2
View File
@@ -92,6 +92,8 @@ class Cluster(object):
self.webui_url = self.head_node.webui_url
else:
ray_params.update_if_absent(redis_address=self.redis_address)
# Use port 0 so that the node picks an unused port for its raylet.
ray_params.update(node_manager_port=0)
node = ray.node.Node(
ray_params,
head=False,
+1
View File
@@ -1215,6 +1215,7 @@ def connect(node,
gcs_options,
node.get_logs_dir_path(),
node.node_ip_address,
node.node_manager_port,
)
worker.raylet_client = ray._raylet.RayletClient(worker.core_worker)
+6
View File
@@ -19,6 +19,11 @@ parser.add_argument(
required=True,
type=str,
help="the ip address of the worker's node")
parser.add_argument(
"--node-manager-port",
required=True,
type=int,
help="the port of the worker's node")
parser.add_argument(
"--redis-address",
required=True,
@@ -74,6 +79,7 @@ if __name__ == "__main__":
ray_params = RayParams(
node_ip_address=args.node_ip_address,
node_manager_port=args.node_manager_port,
redis_address=args.redis_address,
redis_password=args.redis_password,
plasma_store_socket_name=args.object_store_name,