mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 13:02:16 +08:00
[xray] Integrate worker.py with raylet. (#1810)
* Integrate worker with raylet. * Begin allowing worker to attach to cluster. * Fix linting and documentation. * Fix linting. * Comment tests back in. * Fix type of worker command. * Remove xray python files and tests. * Fix from rebase. * Add test. * Copy over raylet executable. * Small cleanup.
This commit is contained in:
committed by
Philipp Moritz
parent
0fc989c6c1
commit
fbfbb1c079
@@ -86,11 +86,13 @@ def cli():
|
||||
help="enable support for huge pages in the object store")
|
||||
@click.option("--autoscaling-config", required=False, type=str,
|
||||
help="the file that contains the autoscaling config")
|
||||
@click.option("--use-raylet", is_flag=True, default=False,
|
||||
help="use the raylet code path, this is not supported yet")
|
||||
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
|
||||
redis_max_clients, redis_shard_ports, object_manager_port,
|
||||
object_store_memory, num_workers, num_cpus, num_gpus, resources,
|
||||
head, no_ui, block, plasma_directory, huge_pages,
|
||||
autoscaling_config):
|
||||
autoscaling_config, use_raylet):
|
||||
# Convert hostnames to numerical IP address.
|
||||
if node_ip_address is not None:
|
||||
node_ip_address = services.address_to_ip(node_ip_address)
|
||||
@@ -161,7 +163,8 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
|
||||
include_webui=(not no_ui),
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
autoscaling_config=autoscaling_config)
|
||||
autoscaling_config=autoscaling_config,
|
||||
use_raylet=use_raylet)
|
||||
print(address_info)
|
||||
print("\nStarted Ray on this node. You can add additional nodes to "
|
||||
"the cluster by calling\n\n"
|
||||
@@ -227,7 +230,8 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
|
||||
redirect_output=True,
|
||||
resources=resources,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages)
|
||||
huge_pages=huge_pages,
|
||||
use_raylet=use_raylet)
|
||||
print(address_info)
|
||||
print("\nStarted Ray on this node. If you wish to terminate the "
|
||||
"processes that have been started, run\n\n"
|
||||
@@ -242,7 +246,7 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
|
||||
@click.command()
|
||||
def stop():
|
||||
subprocess.call(["killall global_scheduler plasma_store plasma_manager "
|
||||
"local_scheduler"], shell=True)
|
||||
"local_scheduler raylet"], shell=True)
|
||||
|
||||
# Find the PID of the monitor process and kill it.
|
||||
subprocess.call(["kill $(ps aux | grep monitor.py | grep -v grep | "
|
||||
|
||||
+194
-87
@@ -28,6 +28,7 @@ import ray.global_scheduler as global_scheduler
|
||||
PROCESS_TYPE_MONITOR = "monitor"
|
||||
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
|
||||
PROCESS_TYPE_WORKER = "worker"
|
||||
PROCESS_TYPE_RAYLET = "raylet"
|
||||
PROCESS_TYPE_LOCAL_SCHEDULER = "local_scheduler"
|
||||
PROCESS_TYPE_PLASMA_MANAGER = "plasma_manager"
|
||||
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
|
||||
@@ -43,6 +44,7 @@ PROCESS_TYPE_WEB_UI = "web_ui"
|
||||
all_processes = OrderedDict([(PROCESS_TYPE_MONITOR, []),
|
||||
(PROCESS_TYPE_LOG_MONITOR, []),
|
||||
(PROCESS_TYPE_WORKER, []),
|
||||
(PROCESS_TYPE_RAYLET, []),
|
||||
(PROCESS_TYPE_LOCAL_SCHEDULER, []),
|
||||
(PROCESS_TYPE_PLASMA_MANAGER, []),
|
||||
(PROCESS_TYPE_PLASMA_STORE, []),
|
||||
@@ -51,6 +53,7 @@ all_processes = OrderedDict([(PROCESS_TYPE_MONITOR, []),
|
||||
(PROCESS_TYPE_WEB_UI, [])],)
|
||||
|
||||
# True if processes are run in the valgrind profiler.
|
||||
RUN_RAYLET_PROFILER = False
|
||||
RUN_LOCAL_SCHEDULER_PROFILER = False
|
||||
RUN_PLASMA_MANAGER_PROFILER = False
|
||||
RUN_PLASMA_STORE_PROFILER = False
|
||||
@@ -74,6 +77,10 @@ CREDIS_MEMBER_MODULE = os.path.join(
|
||||
os.path.abspath(os.path.dirname(__file__)),
|
||||
"core/src/credis/build/src/libmember.so")
|
||||
|
||||
# Location of the raylet executable.
|
||||
RAYLET_EXECUTABLE = os.path.join(
|
||||
os.path.abspath(os.path.dirname(__file__)),
|
||||
"core/src/ray/raylet/raylet")
|
||||
|
||||
# ObjectStoreAddress tuples contain all information necessary to connect to an
|
||||
# object store. The fields are:
|
||||
@@ -123,8 +130,8 @@ def kill_process(p):
|
||||
if p.poll() is not None:
|
||||
# The process has already terminated.
|
||||
return True
|
||||
if any([RUN_LOCAL_SCHEDULER_PROFILER, RUN_PLASMA_MANAGER_PROFILER,
|
||||
RUN_PLASMA_STORE_PROFILER]):
|
||||
if any([RUN_RAYLET_PROFILER, RUN_LOCAL_SCHEDULER_PROFILER,
|
||||
RUN_PLASMA_MANAGER_PROFILER, RUN_PLASMA_STORE_PROFILER]):
|
||||
# Give process signal to write profiler data.
|
||||
os.kill(p.pid, signal.SIGINT)
|
||||
# Wait for profiling data to be written.
|
||||
@@ -860,12 +867,73 @@ def start_local_scheduler(redis_address,
|
||||
return local_scheduler_name
|
||||
|
||||
|
||||
def start_raylet(redis_address,
                 node_ip_address,
                 plasma_store_name,
                 worker_path,
                 stdout_file=None,
                 stderr_file=None,
                 cleanup=True):
    """Launch a raylet (a combined local scheduler and object manager).

    Args:
        redis_address (str): The address of the Redis instance, in the form
            "<ip>:<port>".
        node_ip_address (str): The IP address of the node that this raylet
            is running on.
        plasma_store_name (str): The name of the plasma store socket that the
            raylet should connect to.
        worker_path (str): The path of the script that the raylet runs when
            it starts up new workers.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        cleanup (bool): True if using Ray in local mode. If cleanup is true,
            then this process will be killed by services.cleanup() when the
            Python process that imported services exits.

    Returns:
        The raylet socket name.
    """
    # The raylet executable takes the GCS (Redis) IP address and port as two
    # separate positional arguments.
    gcs_ip_address, gcs_port = redis_address.split(":")
    raylet_name = "/tmp/raylet{}".format(random_name())

    # Command line that the raylet will run whenever it needs a new worker.
    # It is handed to the raylet as one single argument.
    worker_command_template = ("{} {} "
                               "--node-ip-address={} "
                               "--object-store-name={} "
                               "--raylet-name={} "
                               "--redis-address={}")
    start_worker_command = worker_command_template.format(
        sys.executable,
        worker_path,
        node_ip_address,
        plasma_store_name,
        raylet_name,
        redis_address)

    raylet_args = [
        RAYLET_EXECUTABLE,
        raylet_name,
        plasma_store_name,
        node_ip_address,
        gcs_ip_address,
        gcs_port,
        start_worker_command,
    ]
    raylet_process = subprocess.Popen(raylet_args,
                                      stdout=stdout_file,
                                      stderr=stderr_file)

    # Only processes registered here are tracked for termination at exit.
    if cleanup:
        all_processes[PROCESS_TYPE_RAYLET].append(raylet_process)
    record_log_files_in_redis(redis_address, node_ip_address,
                              [stdout_file, stderr_file])

    return raylet_name
|
||||
|
||||
|
||||
def start_objstore(node_ip_address, redis_address,
|
||||
object_manager_port=None, store_stdout_file=None,
|
||||
store_stderr_file=None, manager_stdout_file=None,
|
||||
manager_stderr_file=None, objstore_memory=None,
|
||||
cleanup=True, plasma_directory=None,
|
||||
huge_pages=False):
|
||||
huge_pages=False, use_raylet=False):
|
||||
"""This method starts an object store process.
|
||||
|
||||
Args:
|
||||
@@ -893,6 +961,8 @@ def start_objstore(node_ip_address, redis_address,
|
||||
be created.
|
||||
huge_pages: Boolean flag indicating whether to start the Object
|
||||
Store with hugetlbfs support. Requires plasma_directory.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
|
||||
Return:
|
||||
A tuple of the Plasma store socket name, the Plasma manager socket
|
||||
@@ -936,33 +1006,41 @@ def start_objstore(node_ip_address, redis_address,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages)
|
||||
# Start the plasma manager.
|
||||
if object_manager_port is not None:
|
||||
(plasma_manager_name, p2,
|
||||
plasma_manager_port) = ray.plasma.start_plasma_manager(
|
||||
plasma_store_name,
|
||||
redis_address,
|
||||
plasma_manager_port=object_manager_port,
|
||||
node_ip_address=node_ip_address,
|
||||
num_retries=1,
|
||||
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
|
||||
stdout_file=manager_stdout_file,
|
||||
stderr_file=manager_stderr_file)
|
||||
assert plasma_manager_port == object_manager_port
|
||||
if not use_raylet:
|
||||
if object_manager_port is not None:
|
||||
(plasma_manager_name, p2,
|
||||
plasma_manager_port) = ray.plasma.start_plasma_manager(
|
||||
plasma_store_name,
|
||||
redis_address,
|
||||
plasma_manager_port=object_manager_port,
|
||||
node_ip_address=node_ip_address,
|
||||
num_retries=1,
|
||||
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
|
||||
stdout_file=manager_stdout_file,
|
||||
stderr_file=manager_stderr_file)
|
||||
assert plasma_manager_port == object_manager_port
|
||||
else:
|
||||
(plasma_manager_name, p2,
|
||||
plasma_manager_port) = ray.plasma.start_plasma_manager(
|
||||
plasma_store_name,
|
||||
redis_address,
|
||||
node_ip_address=node_ip_address,
|
||||
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
|
||||
stdout_file=manager_stdout_file,
|
||||
stderr_file=manager_stderr_file)
|
||||
else:
|
||||
(plasma_manager_name, p2,
|
||||
plasma_manager_port) = ray.plasma.start_plasma_manager(
|
||||
plasma_store_name,
|
||||
redis_address,
|
||||
node_ip_address=node_ip_address,
|
||||
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
|
||||
stdout_file=manager_stdout_file,
|
||||
stderr_file=manager_stderr_file)
|
||||
plasma_manager_port = None
|
||||
plasma_manager_name = None
|
||||
|
||||
if cleanup:
|
||||
all_processes[PROCESS_TYPE_PLASMA_STORE].append(p1)
|
||||
all_processes[PROCESS_TYPE_PLASMA_MANAGER].append(p2)
|
||||
record_log_files_in_redis(redis_address, node_ip_address,
|
||||
[store_stdout_file, store_stderr_file,
|
||||
manager_stdout_file, manager_stderr_file])
|
||||
[store_stdout_file, store_stderr_file])
|
||||
if not use_raylet:
|
||||
if cleanup:
|
||||
all_processes[PROCESS_TYPE_PLASMA_MANAGER].append(p2)
|
||||
record_log_files_in_redis(redis_address, node_ip_address,
|
||||
[manager_stdout_file, manager_stderr_file])
|
||||
|
||||
return ObjectStoreAddress(plasma_store_name, plasma_manager_name,
|
||||
plasma_manager_port)
|
||||
@@ -1059,7 +1137,8 @@ def start_ray_processes(address_info=None,
|
||||
resources=None,
|
||||
plasma_directory=None,
|
||||
huge_pages=False,
|
||||
autoscaling_config=None):
|
||||
autoscaling_config=None,
|
||||
use_raylet=False):
|
||||
"""Helper method to start Ray processes.
|
||||
|
||||
Args:
|
||||
@@ -1112,6 +1191,8 @@ def start_ray_processes(address_info=None,
|
||||
huge_pages: Boolean flag indicating whether to start the Object
|
||||
Store with hugetlbfs support. Requires plasma_directory.
|
||||
autoscaling_config: path to autoscaling config file.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
|
||||
Returns:
|
||||
A dictionary of the address information for the processes that were
|
||||
@@ -1193,7 +1274,7 @@ def start_ray_processes(address_info=None,
|
||||
cleanup=cleanup)
|
||||
|
||||
# Start the global scheduler, if necessary.
|
||||
if include_global_scheduler:
|
||||
if include_global_scheduler and not use_raylet:
|
||||
global_scheduler_stdout_file, global_scheduler_stderr_file = (
|
||||
new_log_files("global_scheduler", redirect_output))
|
||||
start_global_scheduler(redis_address,
|
||||
@@ -1235,71 +1316,90 @@ def start_ray_processes(address_info=None,
|
||||
manager_stderr_file=plasma_manager_stderr_file,
|
||||
objstore_memory=object_store_memory,
|
||||
cleanup=cleanup, plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages)
|
||||
huge_pages=huge_pages,
|
||||
use_raylet=use_raylet)
|
||||
object_store_addresses.append(object_store_address)
|
||||
time.sleep(0.1)
|
||||
|
||||
# Start any local schedulers that do not yet exist.
|
||||
for i in range(len(local_scheduler_socket_names), num_local_schedulers):
|
||||
# Connect the local scheduler to the object store at the same index.
|
||||
object_store_address = object_store_addresses[i]
|
||||
plasma_address = "{}:{}".format(node_ip_address,
|
||||
object_store_address.manager_port)
|
||||
# Determine how many workers this local scheduler should start.
|
||||
if start_workers_from_local_scheduler:
|
||||
num_local_scheduler_workers = workers_per_local_scheduler[i]
|
||||
workers_per_local_scheduler[i] = 0
|
||||
else:
|
||||
# If we're starting the workers from Python, the local scheduler
|
||||
# should not start any workers.
|
||||
num_local_scheduler_workers = 0
|
||||
# Start the local scheduler. Note that if we do not wish to redirect
|
||||
# the worker output, then we cannot redirect the local scheduler
|
||||
# output.
|
||||
local_scheduler_stdout_file, local_scheduler_stderr_file = (
|
||||
new_log_files("local_scheduler_{}".format(i),
|
||||
redirect_output=redirect_worker_output))
|
||||
local_scheduler_name = start_local_scheduler(
|
||||
if not use_raylet:
|
||||
for i in range(len(local_scheduler_socket_names),
|
||||
num_local_schedulers):
|
||||
# Connect the local scheduler to the object store at the same
|
||||
# index.
|
||||
object_store_address = object_store_addresses[i]
|
||||
plasma_address = "{}:{}".format(node_ip_address,
|
||||
object_store_address.manager_port)
|
||||
# Determine how many workers this local scheduler should start.
|
||||
if start_workers_from_local_scheduler:
|
||||
num_local_scheduler_workers = workers_per_local_scheduler[i]
|
||||
workers_per_local_scheduler[i] = 0
|
||||
else:
|
||||
# If we're starting the workers from Python, the local
|
||||
# scheduler should not start any workers.
|
||||
num_local_scheduler_workers = 0
|
||||
# Start the local scheduler. Note that if we do not wish to
|
||||
# redirect the worker output, then we cannot redirect the local
|
||||
# scheduler output.
|
||||
local_scheduler_stdout_file, local_scheduler_stderr_file = (
|
||||
new_log_files("local_scheduler_{}".format(i),
|
||||
redirect_output=redirect_worker_output))
|
||||
local_scheduler_name = start_local_scheduler(
|
||||
redis_address,
|
||||
node_ip_address,
|
||||
object_store_address.name,
|
||||
object_store_address.manager_name,
|
||||
worker_path,
|
||||
plasma_address=plasma_address,
|
||||
stdout_file=local_scheduler_stdout_file,
|
||||
stderr_file=local_scheduler_stderr_file,
|
||||
cleanup=cleanup,
|
||||
resources=resources[i],
|
||||
num_workers=num_local_scheduler_workers)
|
||||
local_scheduler_socket_names.append(local_scheduler_name)
|
||||
|
||||
# Make sure that we have exactly num_local_schedulers instances of
|
||||
# object stores and local schedulers.
|
||||
assert len(object_store_addresses) == num_local_schedulers
|
||||
assert len(local_scheduler_socket_names) == num_local_schedulers
|
||||
|
||||
else:
|
||||
# Start the raylet. TODO(rkn): Modify this to allow starting
|
||||
# multiple raylets on the same machine.
|
||||
raylet_stdout_file, raylet_stderr_file = (
|
||||
new_log_files("raylet_{}".format(i),
|
||||
redirect_output=redirect_output))
|
||||
address_info["raylet_socket_name"] = start_raylet(
|
||||
redis_address,
|
||||
node_ip_address,
|
||||
object_store_address.name,
|
||||
object_store_address.manager_name,
|
||||
object_store_addresses[i].name,
|
||||
worker_path,
|
||||
plasma_address=plasma_address,
|
||||
stdout_file=local_scheduler_stdout_file,
|
||||
stderr_file=local_scheduler_stderr_file,
|
||||
cleanup=cleanup,
|
||||
resources=resources[i],
|
||||
num_workers=num_local_scheduler_workers)
|
||||
local_scheduler_socket_names.append(local_scheduler_name)
|
||||
time.sleep(0.1)
|
||||
stdout_file=None,
|
||||
stderr_file=None,
|
||||
cleanup=cleanup)
|
||||
|
||||
# Make sure that we have exactly num_local_schedulers instances of object
|
||||
# stores and local schedulers.
|
||||
assert len(object_store_addresses) == num_local_schedulers
|
||||
assert len(local_scheduler_socket_names) == num_local_schedulers
|
||||
if not use_raylet:
|
||||
# Start any workers that the local scheduler has not already started.
|
||||
for i, num_local_scheduler_workers in enumerate(
|
||||
workers_per_local_scheduler):
|
||||
object_store_address = object_store_addresses[i]
|
||||
local_scheduler_name = local_scheduler_socket_names[i]
|
||||
for j in range(num_local_scheduler_workers):
|
||||
worker_stdout_file, worker_stderr_file = new_log_files(
|
||||
"worker_{}_{}".format(i, j), redirect_output)
|
||||
start_worker(node_ip_address,
|
||||
object_store_address.name,
|
||||
object_store_address.manager_name,
|
||||
local_scheduler_name,
|
||||
redis_address,
|
||||
worker_path,
|
||||
stdout_file=worker_stdout_file,
|
||||
stderr_file=worker_stderr_file,
|
||||
cleanup=cleanup)
|
||||
workers_per_local_scheduler[i] -= 1
|
||||
|
||||
# Start any workers that the local scheduler has not already started.
|
||||
for i, num_local_scheduler_workers in enumerate(
|
||||
workers_per_local_scheduler):
|
||||
object_store_address = object_store_addresses[i]
|
||||
local_scheduler_name = local_scheduler_socket_names[i]
|
||||
for j in range(num_local_scheduler_workers):
|
||||
worker_stdout_file, worker_stderr_file = new_log_files(
|
||||
"worker_{}_{}".format(i, j), redirect_output)
|
||||
start_worker(node_ip_address,
|
||||
object_store_address.name,
|
||||
object_store_address.manager_name,
|
||||
local_scheduler_name,
|
||||
redis_address,
|
||||
worker_path,
|
||||
stdout_file=worker_stdout_file,
|
||||
stderr_file=worker_stderr_file,
|
||||
cleanup=cleanup)
|
||||
workers_per_local_scheduler[i] -= 1
|
||||
|
||||
# Make sure that we've started all the workers.
|
||||
assert(sum(workers_per_local_scheduler) == 0)
|
||||
# Make sure that we've started all the workers.
|
||||
assert(sum(workers_per_local_scheduler) == 0)
|
||||
|
||||
# Try to start the web UI.
|
||||
if include_webui:
|
||||
@@ -1327,7 +1427,8 @@ def start_ray_node(node_ip_address,
|
||||
redirect_output=False,
|
||||
resources=None,
|
||||
plasma_directory=None,
|
||||
huge_pages=False):
|
||||
huge_pages=False,
|
||||
use_raylet=False):
|
||||
"""Start the Ray processes for a single node.
|
||||
|
||||
This assumes that the Ray processes on some master node have already been
|
||||
@@ -1360,6 +1461,8 @@ def start_ray_node(node_ip_address,
|
||||
be created.
|
||||
huge_pages: Boolean flag indicating whether to start the Object
|
||||
Store with hugetlbfs support. Requires plasma_directory.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
|
||||
Returns:
|
||||
A dictionary of the address information for the processes that were
|
||||
@@ -1400,7 +1503,8 @@ def start_ray_head(address_info=None,
|
||||
include_webui=True,
|
||||
plasma_directory=None,
|
||||
huge_pages=False,
|
||||
autoscaling_config=None):
|
||||
autoscaling_config=None,
|
||||
use_raylet=False):
|
||||
"""Start Ray in local mode.
|
||||
|
||||
Args:
|
||||
@@ -1447,6 +1551,8 @@ def start_ray_head(address_info=None,
|
||||
huge_pages: Boolean flag indicating whether to start the Object
|
||||
Store with hugetlbfs support. Requires plasma_directory.
|
||||
autoscaling_config: path to autoscaling config file.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
|
||||
Returns:
|
||||
A dictionary of the address information for the processes that were
|
||||
@@ -1474,7 +1580,8 @@ def start_ray_head(address_info=None,
|
||||
redis_max_clients=redis_max_clients,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
autoscaling_config=autoscaling_config)
|
||||
autoscaling_config=autoscaling_config,
|
||||
use_raylet=use_raylet)
|
||||
|
||||
|
||||
def try_to_create_directory(directory_path):
|
||||
|
||||
+160
-78
@@ -31,6 +31,9 @@ import ray.plasma
|
||||
from ray.utils import (FunctionProperties, random_string, binary_to_hex,
|
||||
is_cython)
|
||||
|
||||
# Import flatbuffer bindings.
|
||||
from ray.core.generated.ClientTableData import ClientTableData
|
||||
|
||||
SCRIPT_MODE = 0
|
||||
WORKER_MODE = 1
|
||||
PYTHON_MODE = 2
|
||||
@@ -50,6 +53,7 @@ NIL_LOCAL_SCHEDULER_ID = NIL_ID
|
||||
NIL_FUNCTION_ID = NIL_ID
|
||||
NIL_ACTOR_ID = NIL_ID
|
||||
NIL_ACTOR_HANDLE_ID = NIL_ID
|
||||
NIL_CLIENT_ID = 20 * b"\xff"
|
||||
|
||||
# This must be kept in sync with the `error_types` array in
|
||||
# common/state/error_table.h.
|
||||
@@ -452,9 +456,12 @@ class Worker(object):
|
||||
for object_id in object_ids]
|
||||
for i in range(0, len(object_ids),
|
||||
ray._config.worker_fetch_request_size()):
|
||||
self.plasma_client.fetch(
|
||||
plain_object_ids[i:(i +
|
||||
ray._config.worker_fetch_request_size())])
|
||||
if not self.use_raylet:
|
||||
self.plasma_client.fetch(
|
||||
plain_object_ids
|
||||
[i:(i + ray._config.worker_fetch_request_size())])
|
||||
else:
|
||||
print("plasma_client.fetch has not been implemented yet")
|
||||
|
||||
# Get the objects. We initially try to get the objects immediately.
|
||||
final_results = self.retrieve_and_deserialize(plain_object_ids, 0)
|
||||
@@ -478,9 +485,12 @@ class Worker(object):
|
||||
plasma.ObjectID, unready_ids.keys()))
|
||||
for i in range(0, len(object_ids_to_fetch),
|
||||
ray._config.worker_fetch_request_size()):
|
||||
self.plasma_client.fetch(
|
||||
object_ids_to_fetch[i:(
|
||||
i + ray._config.worker_fetch_request_size())])
|
||||
if not self.use_raylet:
|
||||
self.plasma_client.fetch(
|
||||
object_ids_to_fetch[i:(
|
||||
i + ray._config.worker_fetch_request_size())])
|
||||
else:
|
||||
print("plasma_client.fetch has not been implemented yet")
|
||||
results = self.retrieve_and_deserialize(
|
||||
object_ids_to_fetch,
|
||||
max([ray._config.get_timeout_milliseconds(),
|
||||
@@ -496,7 +506,7 @@ class Worker(object):
|
||||
|
||||
# If there were objects that we weren't able to get locally, let the
|
||||
# local scheduler know that we're now unblocked.
|
||||
if was_blocked:
|
||||
if was_blocked and not self.use_raylet:
|
||||
self.local_scheduler_client.notify_unblocked()
|
||||
|
||||
assert len(final_results) == len(object_ids)
|
||||
@@ -1150,70 +1160,108 @@ def _initialize_serialization(worker=global_worker):
|
||||
use_dict=True)
|
||||
|
||||
|
||||
def get_address_info_from_redis_helper(redis_address, node_ip_address):
|
||||
def get_address_info_from_redis_helper(redis_address, node_ip_address,
|
||||
use_raylet=False):
|
||||
redis_ip_address, redis_port = redis_address.split(":")
|
||||
# For this command to work, some other client (on the same machine as
|
||||
# Redis) must have run "CONFIG SET protected-mode no".
|
||||
redis_client = redis.StrictRedis(host=redis_ip_address,
|
||||
port=int(redis_port))
|
||||
# The client table prefix must be kept in sync with the file
|
||||
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
|
||||
REDIS_CLIENT_TABLE_PREFIX = "CL:"
|
||||
client_keys = redis_client.keys("{}*".format(REDIS_CLIENT_TABLE_PREFIX))
|
||||
# Filter to live clients on the same node and do some basic checking.
|
||||
plasma_managers = []
|
||||
local_schedulers = []
|
||||
for key in client_keys:
|
||||
info = redis_client.hgetall(key)
|
||||
|
||||
# Ignore clients that were deleted.
|
||||
deleted = info[b"deleted"]
|
||||
deleted = bool(int(deleted))
|
||||
if deleted:
|
||||
continue
|
||||
if not use_raylet:
|
||||
# The client table prefix must be kept in sync with the file
|
||||
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
|
||||
REDIS_CLIENT_TABLE_PREFIX = "CL:"
|
||||
client_keys = redis_client.keys(
|
||||
"{}*".format(REDIS_CLIENT_TABLE_PREFIX))
|
||||
# Filter to live clients on the same node and do some basic checking.
|
||||
plasma_managers = []
|
||||
local_schedulers = []
|
||||
for key in client_keys:
|
||||
info = redis_client.hgetall(key)
|
||||
|
||||
assert b"ray_client_id" in info
|
||||
assert b"node_ip_address" in info
|
||||
assert b"client_type" in info
|
||||
client_node_ip_address = info[b"node_ip_address"].decode("ascii")
|
||||
if (client_node_ip_address == node_ip_address or
|
||||
(client_node_ip_address == "127.0.0.1" and
|
||||
redis_ip_address == ray.services.get_node_ip_address())):
|
||||
if info[b"client_type"].decode("ascii") == "plasma_manager":
|
||||
plasma_managers.append(info)
|
||||
elif info[b"client_type"].decode("ascii") == "local_scheduler":
|
||||
local_schedulers.append(info)
|
||||
# Make sure that we got at least one plasma manager and local scheduler.
|
||||
assert len(plasma_managers) >= 1
|
||||
assert len(local_schedulers) >= 1
|
||||
# Build the address information.
|
||||
object_store_addresses = []
|
||||
for manager in plasma_managers:
|
||||
address = manager[b"manager_address"].decode("ascii")
|
||||
port = services.get_port(address)
|
||||
object_store_addresses.append(
|
||||
services.ObjectStoreAddress(
|
||||
name=manager[b"store_socket_name"].decode("ascii"),
|
||||
manager_name=manager[b"manager_socket_name"].decode("ascii"),
|
||||
manager_port=port))
|
||||
scheduler_names = [
|
||||
scheduler[b"local_scheduler_socket_name"].decode("ascii")
|
||||
for scheduler in local_schedulers]
|
||||
client_info = {"node_ip_address": node_ip_address,
|
||||
"redis_address": redis_address,
|
||||
"object_store_addresses": object_store_addresses,
|
||||
"local_scheduler_socket_names": scheduler_names,
|
||||
# Web UI should be running.
|
||||
"webui_url": _webui_url_helper(redis_client)}
|
||||
return client_info
|
||||
# Ignore clients that were deleted.
|
||||
deleted = info[b"deleted"]
|
||||
deleted = bool(int(deleted))
|
||||
if deleted:
|
||||
continue
|
||||
|
||||
assert b"ray_client_id" in info
|
||||
assert b"node_ip_address" in info
|
||||
assert b"client_type" in info
|
||||
client_node_ip_address = info[b"node_ip_address"].decode("ascii")
|
||||
if (client_node_ip_address == node_ip_address or
|
||||
(client_node_ip_address == "127.0.0.1" and
|
||||
redis_ip_address == ray.services.get_node_ip_address())):
|
||||
if info[b"client_type"].decode("ascii") == "plasma_manager":
|
||||
plasma_managers.append(info)
|
||||
elif info[b"client_type"].decode("ascii") == "local_scheduler":
|
||||
local_schedulers.append(info)
|
||||
# Make sure that we got at least one plasma manager and local
|
||||
# scheduler.
|
||||
assert len(plasma_managers) >= 1
|
||||
assert len(local_schedulers) >= 1
|
||||
# Build the address information.
|
||||
object_store_addresses = []
|
||||
for manager in plasma_managers:
|
||||
address = manager[b"manager_address"].decode("ascii")
|
||||
port = services.get_port(address)
|
||||
object_store_addresses.append(
|
||||
services.ObjectStoreAddress(
|
||||
name=manager[b"store_socket_name"].decode("ascii"),
|
||||
manager_name=manager[b"manager_socket_name"].decode(
|
||||
"ascii"),
|
||||
manager_port=port))
|
||||
scheduler_names = [
|
||||
scheduler[b"local_scheduler_socket_name"].decode("ascii")
|
||||
for scheduler in local_schedulers]
|
||||
client_info = {"node_ip_address": node_ip_address,
|
||||
"redis_address": redis_address,
|
||||
"object_store_addresses": object_store_addresses,
|
||||
"local_scheduler_socket_names": scheduler_names,
|
||||
# Web UI should be running.
|
||||
"webui_url": _webui_url_helper(redis_client)}
|
||||
return client_info
|
||||
|
||||
# Handle the raylet case.
|
||||
else:
|
||||
# In the raylet code path, all client data is stored in a zset at the
|
||||
# key for the nil client.
|
||||
client_key = b"CLIENT:" + NIL_CLIENT_ID
|
||||
clients = redis_client.zrange(client_key, 0, -1)
|
||||
raylets = []
|
||||
for client_message in clients:
|
||||
client = ClientTableData.GetRootAsClientTableData(client_message,
|
||||
0)
|
||||
client_node_ip_address = client.NodeManagerAddress().decode(
|
||||
"ascii")
|
||||
if (client_node_ip_address == node_ip_address or
|
||||
(client_node_ip_address == "127.0.0.1" and
|
||||
redis_ip_address == ray.services.get_node_ip_address())):
|
||||
raylets.append(client)
|
||||
|
||||
# TODO(rkn): The ObjectStoreSocketName field does not exist.
|
||||
object_store_addresses = [
|
||||
raylet.ObjectStoreSocketName().decode("ascii")
|
||||
for raylet in raylets]
|
||||
raylet_socket_names = [raylet.NodeManagerAddress().decode("ascii") for
|
||||
raylet in raylets]
|
||||
return {"node_ip_address": node_ip_address,
|
||||
"redis_address": redis_address,
|
||||
"object_store_addresses": object_store_addresses,
|
||||
"raylet_socket_names": raylet_socket_names,
|
||||
# Web UI should be running.
|
||||
"webui_url": _webui_url_helper(redis_client)}
|
||||
|
||||
|
||||
def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5):
|
||||
def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5,
|
||||
use_raylet=False):
|
||||
counter = 0
|
||||
while True:
|
||||
try:
|
||||
return get_address_info_from_redis_helper(redis_address,
|
||||
node_ip_address)
|
||||
node_ip_address,
|
||||
use_raylet=use_raylet)
|
||||
except Exception as e:
|
||||
if counter == num_retries:
|
||||
raise
|
||||
@@ -1281,7 +1329,8 @@ def _init(address_info=None,
|
||||
redis_max_clients=None,
|
||||
plasma_directory=None,
|
||||
huge_pages=False,
|
||||
include_webui=True):
|
||||
include_webui=True,
|
||||
use_raylet=False):
|
||||
"""Helper method to connect to an existing Ray cluster or start a new one.
|
||||
|
||||
This method handles two cases. Either a Ray cluster already exists and we
|
||||
@@ -1336,6 +1385,8 @@ def _init(address_info=None,
|
||||
Store with hugetlbfs support. Requires plasma_directory.
|
||||
include_webui: Boolean flag indicating whether to start the web
|
||||
UI, which is a Jupyter notebook.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
|
||||
Returns:
|
||||
Address information about the started processes.
|
||||
@@ -1402,7 +1453,8 @@ def _init(address_info=None,
|
||||
redis_max_clients=redis_max_clients,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
include_webui=include_webui)
|
||||
include_webui=include_webui,
|
||||
use_raylet=use_raylet)
|
||||
else:
|
||||
if redis_address is None:
|
||||
raise Exception("When connecting to an existing cluster, "
|
||||
@@ -1439,7 +1491,8 @@ def _init(address_info=None,
|
||||
node_ip_address = services.get_node_ip_address(redis_address)
|
||||
# Get the address info of the processes to connect to from Redis.
|
||||
address_info = get_address_info_from_redis(redis_address,
|
||||
node_ip_address)
|
||||
node_ip_address,
|
||||
use_raylet=use_raylet)
|
||||
|
||||
# Connect this driver to Redis, the object store, and the local scheduler.
|
||||
# Choose the first object store and local scheduler if there are multiple.
|
||||
@@ -1453,13 +1506,17 @@ def _init(address_info=None,
|
||||
"redis_address": address_info["redis_address"],
|
||||
"store_socket_name": (
|
||||
address_info["object_store_addresses"][0].name),
|
||||
"manager_socket_name": (
|
||||
address_info["object_store_addresses"][0].manager_name),
|
||||
"local_scheduler_socket_name": (
|
||||
address_info["local_scheduler_socket_names"][0]),
|
||||
"webui_url": address_info["webui_url"]}
|
||||
if not use_raylet:
|
||||
driver_address_info["manager_socket_name"] = (
|
||||
address_info["object_store_addresses"][0].manager_name)
|
||||
driver_address_info["local_scheduler_socket_name"] = (
|
||||
address_info["local_scheduler_socket_names"][0])
|
||||
else:
|
||||
driver_address_info["raylet_socket_name"] = (
|
||||
address_info["raylet_socket_name"])
|
||||
connect(driver_address_info, object_id_seed=object_id_seed,
|
||||
mode=driver_mode, worker=global_worker)
|
||||
mode=driver_mode, worker=global_worker, use_raylet=use_raylet)
|
||||
return address_info
|
||||
|
||||
|
||||
@@ -1469,7 +1526,8 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
|
||||
num_cpus=None, num_gpus=None, resources=None,
|
||||
num_custom_resource=None, num_redis_shards=None,
|
||||
redis_max_clients=None, plasma_directory=None,
|
||||
huge_pages=False, include_webui=True, object_store_memory=None):
|
||||
huge_pages=False, include_webui=True, object_store_memory=None,
|
||||
use_raylet=False):
|
||||
"""Connect to an existing Ray cluster or start one and connect to it.
|
||||
|
||||
This method handles two cases. Either a Ray cluster already exists and we
|
||||
@@ -1513,6 +1571,9 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
|
||||
UI, which is a Jupyter notebook.
|
||||
object_store_memory: The amount of memory (in bytes) to start the
|
||||
object store with.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
|
||||
|
||||
Returns:
|
||||
Address information about the started processes.
|
||||
@@ -1539,7 +1600,8 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
include_webui=include_webui,
|
||||
object_store_memory=object_store_memory)
|
||||
object_store_memory=object_store_memory,
|
||||
use_raylet=use_raylet)
|
||||
|
||||
|
||||
def cleanup(worker=global_worker):
|
||||
@@ -1818,7 +1880,8 @@ def import_thread(worker, mode):
|
||||
pass
|
||||
|
||||
|
||||
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker,
|
||||
use_raylet=False):
|
||||
"""Connect this worker to the local scheduler, to Plasma, and to Redis.
|
||||
|
||||
Args:
|
||||
@@ -1828,6 +1891,8 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
deterministic.
|
||||
mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE,
|
||||
PYTHON_MODE, and SILENT_MODE.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
"""
|
||||
check_main_thread()
|
||||
# Do some basic checking to make sure we didn't call ray.init twice.
|
||||
@@ -1842,6 +1907,7 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
worker.actor_id = NIL_ACTOR_ID
|
||||
worker.connected = True
|
||||
worker.set_mode(mode)
|
||||
worker.use_raylet = use_raylet
|
||||
# The worker.events field is used to aggregate logging information and
|
||||
# display it in the web UI. Note that Python lists protected by the GIL,
|
||||
# which is important because we will append to this field from multiple
|
||||
@@ -1909,8 +1975,9 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
"driver_id": worker.worker_id,
|
||||
"start_time": time.time(),
|
||||
"plasma_store_socket": info["store_socket_name"],
|
||||
"plasma_manager_socket": info["manager_socket_name"],
|
||||
"local_scheduler_socket": info["local_scheduler_socket_name"]}
|
||||
"plasma_manager_socket": info.get("manager_socket_name"),
|
||||
"local_scheduler_socket": info.get("local_scheduler_socket_name"),
|
||||
"raylet_socket": info.get("raylet_socket_name")}
|
||||
driver_info["name"] = (main.__file__ if hasattr(main, "__file__")
|
||||
else "INTERACTIVE MODE")
|
||||
worker.redis_client.hmset(b"Drivers:" + worker.worker_id, driver_info)
|
||||
@@ -1933,11 +2000,22 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
raise Exception("This code should be unreachable.")
|
||||
|
||||
# Create an object store client.
|
||||
worker.plasma_client = plasma.connect(info["store_socket_name"],
|
||||
info["manager_socket_name"],
|
||||
64)
|
||||
if not worker.use_raylet:
|
||||
worker.plasma_client = plasma.connect(info["store_socket_name"],
|
||||
info["manager_socket_name"],
|
||||
64)
|
||||
else:
|
||||
worker.plasma_client = plasma.connect(info["store_socket_name"],
|
||||
"",
|
||||
64)
|
||||
|
||||
if not worker.use_raylet:
|
||||
local_scheduler_socket = info["local_scheduler_socket_name"]
|
||||
else:
|
||||
local_scheduler_socket = info["raylet_socket_name"]
|
||||
|
||||
worker.local_scheduler_client = ray.local_scheduler.LocalSchedulerClient(
|
||||
info["local_scheduler_socket_name"], worker.worker_id, is_worker)
|
||||
local_scheduler_socket, worker.worker_id, is_worker)
|
||||
|
||||
# If this is a driver, set the current task ID, the task driver ID, and set
|
||||
# the task index to 0.
|
||||
@@ -2275,9 +2353,10 @@ def flush_log(worker=global_worker):
|
||||
"""Send the logged worker events to the global state store."""
|
||||
event_log_key = b"event_log:" + worker.worker_id
|
||||
event_log_value = json.dumps(worker.events)
|
||||
worker.local_scheduler_client.log_event(event_log_key,
|
||||
event_log_value,
|
||||
time.time())
|
||||
if not worker.use_raylet:
|
||||
worker.local_scheduler_client.log_event(event_log_key,
|
||||
event_log_value,
|
||||
time.time())
|
||||
worker.events = []
|
||||
|
||||
|
||||
@@ -2367,6 +2446,9 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker):
|
||||
A list of object IDs that are ready and a list of the remaining object
|
||||
IDs.
|
||||
"""
|
||||
if worker.use_raylet:
|
||||
print("plasma_client.wait has not been implemented yet")
|
||||
return
|
||||
|
||||
if isinstance(object_ids, ray.local_scheduler.ObjectID):
|
||||
raise TypeError(
|
||||
|
||||
@@ -16,10 +16,12 @@ parser.add_argument("--redis-address", required=True, type=str,
|
||||
help="the address to use for Redis")
|
||||
parser.add_argument("--object-store-name", required=True, type=str,
|
||||
help="the object store's name")
|
||||
parser.add_argument("--object-store-manager-name", required=True, type=str,
|
||||
parser.add_argument("--object-store-manager-name", required=False, type=str,
|
||||
help="the object store manager's name")
|
||||
parser.add_argument("--local-scheduler-name", required=True, type=str,
|
||||
parser.add_argument("--local-scheduler-name", required=False, type=str,
|
||||
help="the local scheduler's name")
|
||||
parser.add_argument("--raylet-name", required=False, type=str,
|
||||
help="the raylet's name")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -29,9 +31,11 @@ if __name__ == "__main__":
|
||||
"redis_address": args.redis_address,
|
||||
"store_socket_name": args.object_store_name,
|
||||
"manager_socket_name": args.object_store_manager_name,
|
||||
"local_scheduler_socket_name": args.local_scheduler_name}
|
||||
"local_scheduler_socket_name": args.local_scheduler_name,
|
||||
"raylet_socket_name": args.raylet_name}
|
||||
|
||||
ray.worker.connect(info, mode=ray.WORKER_MODE)
|
||||
ray.worker.connect(info, mode=ray.WORKER_MODE,
|
||||
use_raylet=(args.raylet_name is not None))
|
||||
|
||||
error_explanation = """
|
||||
This error is unexpected and should not have happened. Somehow a worker
|
||||
|
||||
@@ -23,6 +23,7 @@ ray_files = [
|
||||
"ray/core/src/local_scheduler/local_scheduler",
|
||||
"ray/core/src/local_scheduler/liblocal_scheduler_library.so",
|
||||
"ray/core/src/global_scheduler/global_scheduler",
|
||||
"ray/core/src/ray/raylet/raylet",
|
||||
"ray/WebUI.ipynb"
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user