[xray] Integrate worker.py with raylet. (#1810)

* Integrate worker with raylet.

* Begin allowing worker to attach to cluster.

* Fix linting and documentation.

* Fix linting.

* Comment tests back in.

* Fix type of worker command.

* Remove xray python files and tests.

* Fix from rebase.

* Add test.

* Copy over raylet executable.

* Small cleanup.
This commit is contained in:
Robert Nishihara
2018-04-03 02:38:56 -07:00
committed by Philipp Moritz
parent 0fc989c6c1
commit fbfbb1c079
22 changed files with 459 additions and 506 deletions
+8 -4
View File
@@ -86,11 +86,13 @@ def cli():
help="enable support for huge pages in the object store")
@click.option("--autoscaling-config", required=False, type=str,
help="the file that contains the autoscaling config")
@click.option("--use-raylet", is_flag=True, default=False,
help="use the raylet code path, this is not supported yet")
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
redis_max_clients, redis_shard_ports, object_manager_port,
object_store_memory, num_workers, num_cpus, num_gpus, resources,
head, no_ui, block, plasma_directory, huge_pages,
autoscaling_config):
autoscaling_config, use_raylet):
# Convert hostnames to numerical IP address.
if node_ip_address is not None:
node_ip_address = services.address_to_ip(node_ip_address)
@@ -161,7 +163,8 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
include_webui=(not no_ui),
plasma_directory=plasma_directory,
huge_pages=huge_pages,
autoscaling_config=autoscaling_config)
autoscaling_config=autoscaling_config,
use_raylet=use_raylet)
print(address_info)
print("\nStarted Ray on this node. You can add additional nodes to "
"the cluster by calling\n\n"
@@ -227,7 +230,8 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
redirect_output=True,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages)
huge_pages=huge_pages,
use_raylet=use_raylet)
print(address_info)
print("\nStarted Ray on this node. If you wish to terminate the "
"processes that have been started, run\n\n"
@@ -242,7 +246,7 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
@click.command()
def stop():
subprocess.call(["killall global_scheduler plasma_store plasma_manager "
"local_scheduler"], shell=True)
"local_scheduler raylet"], shell=True)
# Find the PID of the monitor process and kill it.
subprocess.call(["kill $(ps aux | grep monitor.py | grep -v grep | "
+194 -87
View File
@@ -28,6 +28,7 @@ import ray.global_scheduler as global_scheduler
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
PROCESS_TYPE_WORKER = "worker"
PROCESS_TYPE_RAYLET = "raylet"
PROCESS_TYPE_LOCAL_SCHEDULER = "local_scheduler"
PROCESS_TYPE_PLASMA_MANAGER = "plasma_manager"
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
@@ -43,6 +44,7 @@ PROCESS_TYPE_WEB_UI = "web_ui"
all_processes = OrderedDict([(PROCESS_TYPE_MONITOR, []),
(PROCESS_TYPE_LOG_MONITOR, []),
(PROCESS_TYPE_WORKER, []),
(PROCESS_TYPE_RAYLET, []),
(PROCESS_TYPE_LOCAL_SCHEDULER, []),
(PROCESS_TYPE_PLASMA_MANAGER, []),
(PROCESS_TYPE_PLASMA_STORE, []),
@@ -51,6 +53,7 @@ all_processes = OrderedDict([(PROCESS_TYPE_MONITOR, []),
(PROCESS_TYPE_WEB_UI, [])],)
# True if processes are run in the valgrind profiler.
RUN_RAYLET_PROFILER = False
RUN_LOCAL_SCHEDULER_PROFILER = False
RUN_PLASMA_MANAGER_PROFILER = False
RUN_PLASMA_STORE_PROFILER = False
@@ -74,6 +77,10 @@ CREDIS_MEMBER_MODULE = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"core/src/credis/build/src/libmember.so")
# Location of the raylet executable.
RAYLET_EXECUTABLE = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"core/src/ray/raylet/raylet")
# ObjectStoreAddress tuples contain all information necessary to connect to an
# object store. The fields are:
@@ -123,8 +130,8 @@ def kill_process(p):
if p.poll() is not None:
# The process has already terminated.
return True
if any([RUN_LOCAL_SCHEDULER_PROFILER, RUN_PLASMA_MANAGER_PROFILER,
RUN_PLASMA_STORE_PROFILER]):
if any([RUN_RAYLET_PROFILER, RUN_LOCAL_SCHEDULER_PROFILER,
RUN_PLASMA_MANAGER_PROFILER, RUN_PLASMA_STORE_PROFILER]):
# Give process signal to write profiler data.
os.kill(p.pid, signal.SIGINT)
# Wait for profiling data to be written.
@@ -860,12 +867,73 @@ def start_local_scheduler(redis_address,
return local_scheduler_name
def start_raylet(redis_address,
                 node_ip_address,
                 plasma_store_name,
                 worker_path,
                 stdout_file=None,
                 stderr_file=None,
                 cleanup=True):
    """Launch a raylet, i.e. a combined local scheduler and object manager.

    Args:
        redis_address (str): The address of the Redis instance, in the form
            "ip:port". It is split on ":" to obtain the GCS IP address and
            port that are handed to the raylet executable.
        node_ip_address (str): The IP address of the node that this raylet
            is running on.
        plasma_store_name (str): The name of the plasma store socket to
            connect to.
        worker_path (str): The path of the script that the raylet runs when
            it starts up new workers.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        cleanup (bool): True if using Ray in local mode. If cleanup is true,
            the started process is recorded in all_processes so that it will
            be killed by services.cleanup() when the Python process that
            imported services exits.

    Returns:
        The raylet socket name.
    """
    gcs_ip_address, gcs_port = redis_address.split(":")
    raylet_name = "/tmp/raylet{}".format(random_name())

    # Assemble the command line that the raylet will use to start workers.
    # It is passed to the raylet as one single argument.
    worker_command_parts = [
        "{} {}".format(sys.executable, worker_path),
        "--node-ip-address={}".format(node_ip_address),
        "--object-store-name={}".format(plasma_store_name),
        "--raylet-name={}".format(raylet_name),
        "--redis-address={}".format(redis_address),
    ]
    start_worker_command = " ".join(worker_command_parts)

    raylet_process = subprocess.Popen(
        [RAYLET_EXECUTABLE,
         raylet_name,
         plasma_store_name,
         node_ip_address,
         gcs_ip_address,
         gcs_port,
         start_worker_command],
        stdout=stdout_file,
        stderr=stderr_file)

    # Only track the process for later cleanup when asked to.
    if cleanup:
        all_processes[PROCESS_TYPE_RAYLET].append(raylet_process)

    record_log_files_in_redis(redis_address, node_ip_address,
                              [stdout_file, stderr_file])
    return raylet_name
def start_objstore(node_ip_address, redis_address,
object_manager_port=None, store_stdout_file=None,
store_stderr_file=None, manager_stdout_file=None,
manager_stderr_file=None, objstore_memory=None,
cleanup=True, plasma_directory=None,
huge_pages=False):
huge_pages=False, use_raylet=False):
"""This method starts an object store process.
Args:
@@ -893,6 +961,8 @@ def start_objstore(node_ip_address, redis_address,
be created.
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
Return:
A tuple of the Plasma store socket name, the Plasma manager socket
@@ -936,33 +1006,41 @@ def start_objstore(node_ip_address, redis_address,
plasma_directory=plasma_directory,
huge_pages=huge_pages)
# Start the plasma manager.
if object_manager_port is not None:
(plasma_manager_name, p2,
plasma_manager_port) = ray.plasma.start_plasma_manager(
plasma_store_name,
redis_address,
plasma_manager_port=object_manager_port,
node_ip_address=node_ip_address,
num_retries=1,
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
stdout_file=manager_stdout_file,
stderr_file=manager_stderr_file)
assert plasma_manager_port == object_manager_port
if not use_raylet:
if object_manager_port is not None:
(plasma_manager_name, p2,
plasma_manager_port) = ray.plasma.start_plasma_manager(
plasma_store_name,
redis_address,
plasma_manager_port=object_manager_port,
node_ip_address=node_ip_address,
num_retries=1,
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
stdout_file=manager_stdout_file,
stderr_file=manager_stderr_file)
assert plasma_manager_port == object_manager_port
else:
(plasma_manager_name, p2,
plasma_manager_port) = ray.plasma.start_plasma_manager(
plasma_store_name,
redis_address,
node_ip_address=node_ip_address,
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
stdout_file=manager_stdout_file,
stderr_file=manager_stderr_file)
else:
(plasma_manager_name, p2,
plasma_manager_port) = ray.plasma.start_plasma_manager(
plasma_store_name,
redis_address,
node_ip_address=node_ip_address,
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
stdout_file=manager_stdout_file,
stderr_file=manager_stderr_file)
plasma_manager_port = None
plasma_manager_name = None
if cleanup:
all_processes[PROCESS_TYPE_PLASMA_STORE].append(p1)
all_processes[PROCESS_TYPE_PLASMA_MANAGER].append(p2)
record_log_files_in_redis(redis_address, node_ip_address,
[store_stdout_file, store_stderr_file,
manager_stdout_file, manager_stderr_file])
[store_stdout_file, store_stderr_file])
if not use_raylet:
if cleanup:
all_processes[PROCESS_TYPE_PLASMA_MANAGER].append(p2)
record_log_files_in_redis(redis_address, node_ip_address,
[manager_stdout_file, manager_stderr_file])
return ObjectStoreAddress(plasma_store_name, plasma_manager_name,
plasma_manager_port)
@@ -1059,7 +1137,8 @@ def start_ray_processes(address_info=None,
resources=None,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None):
autoscaling_config=None,
use_raylet=False):
"""Helper method to start Ray processes.
Args:
@@ -1112,6 +1191,8 @@ def start_ray_processes(address_info=None,
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
Returns:
A dictionary of the address information for the processes that were
@@ -1193,7 +1274,7 @@ def start_ray_processes(address_info=None,
cleanup=cleanup)
# Start the global scheduler, if necessary.
if include_global_scheduler:
if include_global_scheduler and not use_raylet:
global_scheduler_stdout_file, global_scheduler_stderr_file = (
new_log_files("global_scheduler", redirect_output))
start_global_scheduler(redis_address,
@@ -1235,71 +1316,90 @@ def start_ray_processes(address_info=None,
manager_stderr_file=plasma_manager_stderr_file,
objstore_memory=object_store_memory,
cleanup=cleanup, plasma_directory=plasma_directory,
huge_pages=huge_pages)
huge_pages=huge_pages,
use_raylet=use_raylet)
object_store_addresses.append(object_store_address)
time.sleep(0.1)
# Start any local schedulers that do not yet exist.
for i in range(len(local_scheduler_socket_names), num_local_schedulers):
# Connect the local scheduler to the object store at the same index.
object_store_address = object_store_addresses[i]
plasma_address = "{}:{}".format(node_ip_address,
object_store_address.manager_port)
# Determine how many workers this local scheduler should start.
if start_workers_from_local_scheduler:
num_local_scheduler_workers = workers_per_local_scheduler[i]
workers_per_local_scheduler[i] = 0
else:
# If we're starting the workers from Python, the local scheduler
# should not start any workers.
num_local_scheduler_workers = 0
# Start the local scheduler. Note that if we do not wish to redirect
# the worker output, then we cannot redirect the local scheduler
# output.
local_scheduler_stdout_file, local_scheduler_stderr_file = (
new_log_files("local_scheduler_{}".format(i),
redirect_output=redirect_worker_output))
local_scheduler_name = start_local_scheduler(
if not use_raylet:
for i in range(len(local_scheduler_socket_names),
num_local_schedulers):
# Connect the local scheduler to the object store at the same
# index.
object_store_address = object_store_addresses[i]
plasma_address = "{}:{}".format(node_ip_address,
object_store_address.manager_port)
# Determine how many workers this local scheduler should start.
if start_workers_from_local_scheduler:
num_local_scheduler_workers = workers_per_local_scheduler[i]
workers_per_local_scheduler[i] = 0
else:
# If we're starting the workers from Python, the local
# scheduler should not start any workers.
num_local_scheduler_workers = 0
# Start the local scheduler. Note that if we do not wish to
# redirect the worker output, then we cannot redirect the local
# scheduler output.
local_scheduler_stdout_file, local_scheduler_stderr_file = (
new_log_files("local_scheduler_{}".format(i),
redirect_output=redirect_worker_output))
local_scheduler_name = start_local_scheduler(
redis_address,
node_ip_address,
object_store_address.name,
object_store_address.manager_name,
worker_path,
plasma_address=plasma_address,
stdout_file=local_scheduler_stdout_file,
stderr_file=local_scheduler_stderr_file,
cleanup=cleanup,
resources=resources[i],
num_workers=num_local_scheduler_workers)
local_scheduler_socket_names.append(local_scheduler_name)
# Make sure that we have exactly num_local_schedulers instances of
# object stores and local schedulers.
assert len(object_store_addresses) == num_local_schedulers
assert len(local_scheduler_socket_names) == num_local_schedulers
else:
# Start the raylet. TODO(rkn): Modify this to allow starting
# multiple raylets on the same machine.
raylet_stdout_file, raylet_stderr_file = (
new_log_files("raylet_{}".format(i),
redirect_output=redirect_output))
address_info["raylet_socket_name"] = start_raylet(
redis_address,
node_ip_address,
object_store_address.name,
object_store_address.manager_name,
object_store_addresses[i].name,
worker_path,
plasma_address=plasma_address,
stdout_file=local_scheduler_stdout_file,
stderr_file=local_scheduler_stderr_file,
cleanup=cleanup,
resources=resources[i],
num_workers=num_local_scheduler_workers)
local_scheduler_socket_names.append(local_scheduler_name)
time.sleep(0.1)
stdout_file=None,
stderr_file=None,
cleanup=cleanup)
# Make sure that we have exactly num_local_schedulers instances of object
# stores and local schedulers.
assert len(object_store_addresses) == num_local_schedulers
assert len(local_scheduler_socket_names) == num_local_schedulers
if not use_raylet:
# Start any workers that the local scheduler has not already started.
for i, num_local_scheduler_workers in enumerate(
workers_per_local_scheduler):
object_store_address = object_store_addresses[i]
local_scheduler_name = local_scheduler_socket_names[i]
for j in range(num_local_scheduler_workers):
worker_stdout_file, worker_stderr_file = new_log_files(
"worker_{}_{}".format(i, j), redirect_output)
start_worker(node_ip_address,
object_store_address.name,
object_store_address.manager_name,
local_scheduler_name,
redis_address,
worker_path,
stdout_file=worker_stdout_file,
stderr_file=worker_stderr_file,
cleanup=cleanup)
workers_per_local_scheduler[i] -= 1
# Start any workers that the local scheduler has not already started.
for i, num_local_scheduler_workers in enumerate(
workers_per_local_scheduler):
object_store_address = object_store_addresses[i]
local_scheduler_name = local_scheduler_socket_names[i]
for j in range(num_local_scheduler_workers):
worker_stdout_file, worker_stderr_file = new_log_files(
"worker_{}_{}".format(i, j), redirect_output)
start_worker(node_ip_address,
object_store_address.name,
object_store_address.manager_name,
local_scheduler_name,
redis_address,
worker_path,
stdout_file=worker_stdout_file,
stderr_file=worker_stderr_file,
cleanup=cleanup)
workers_per_local_scheduler[i] -= 1
# Make sure that we've started all the workers.
assert(sum(workers_per_local_scheduler) == 0)
# Make sure that we've started all the workers.
assert(sum(workers_per_local_scheduler) == 0)
# Try to start the web UI.
if include_webui:
@@ -1327,7 +1427,8 @@ def start_ray_node(node_ip_address,
redirect_output=False,
resources=None,
plasma_directory=None,
huge_pages=False):
huge_pages=False,
use_raylet=False):
"""Start the Ray processes for a single node.
This assumes that the Ray processes on some master node have already been
@@ -1360,6 +1461,8 @@ def start_ray_node(node_ip_address,
be created.
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
Returns:
A dictionary of the address information for the processes that were
@@ -1400,7 +1503,8 @@ def start_ray_head(address_info=None,
include_webui=True,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None):
autoscaling_config=None,
use_raylet=False):
"""Start Ray in local mode.
Args:
@@ -1447,6 +1551,8 @@ def start_ray_head(address_info=None,
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
Returns:
A dictionary of the address information for the processes that were
@@ -1474,7 +1580,8 @@ def start_ray_head(address_info=None,
redis_max_clients=redis_max_clients,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
autoscaling_config=autoscaling_config)
autoscaling_config=autoscaling_config,
use_raylet=use_raylet)
def try_to_create_directory(directory_path):
+160 -78
View File
@@ -31,6 +31,9 @@ import ray.plasma
from ray.utils import (FunctionProperties, random_string, binary_to_hex,
is_cython)
# Import flatbuffer bindings.
from ray.core.generated.ClientTableData import ClientTableData
SCRIPT_MODE = 0
WORKER_MODE = 1
PYTHON_MODE = 2
@@ -50,6 +53,7 @@ NIL_LOCAL_SCHEDULER_ID = NIL_ID
NIL_FUNCTION_ID = NIL_ID
NIL_ACTOR_ID = NIL_ID
NIL_ACTOR_HANDLE_ID = NIL_ID
NIL_CLIENT_ID = 20 * b"\xff"
# This must be kept in sync with the `error_types` array in
# common/state/error_table.h.
@@ -452,9 +456,12 @@ class Worker(object):
for object_id in object_ids]
for i in range(0, len(object_ids),
ray._config.worker_fetch_request_size()):
self.plasma_client.fetch(
plain_object_ids[i:(i +
ray._config.worker_fetch_request_size())])
if not self.use_raylet:
self.plasma_client.fetch(
plain_object_ids
[i:(i + ray._config.worker_fetch_request_size())])
else:
print("plasma_client.fetch has not been implemented yet")
# Get the objects. We initially try to get the objects immediately.
final_results = self.retrieve_and_deserialize(plain_object_ids, 0)
@@ -478,9 +485,12 @@ class Worker(object):
plasma.ObjectID, unready_ids.keys()))
for i in range(0, len(object_ids_to_fetch),
ray._config.worker_fetch_request_size()):
self.plasma_client.fetch(
object_ids_to_fetch[i:(
i + ray._config.worker_fetch_request_size())])
if not self.use_raylet:
self.plasma_client.fetch(
object_ids_to_fetch[i:(
i + ray._config.worker_fetch_request_size())])
else:
print("plasma_client.fetch has not been implemented yet")
results = self.retrieve_and_deserialize(
object_ids_to_fetch,
max([ray._config.get_timeout_milliseconds(),
@@ -496,7 +506,7 @@ class Worker(object):
# If there were objects that we weren't able to get locally, let the
# local scheduler know that we're now unblocked.
if was_blocked:
if was_blocked and not self.use_raylet:
self.local_scheduler_client.notify_unblocked()
assert len(final_results) == len(object_ids)
@@ -1150,70 +1160,108 @@ def _initialize_serialization(worker=global_worker):
use_dict=True)
def get_address_info_from_redis_helper(redis_address, node_ip_address):
def get_address_info_from_redis_helper(redis_address, node_ip_address,
use_raylet=False):
redis_ip_address, redis_port = redis_address.split(":")
# For this command to work, some other client (on the same machine as
# Redis) must have run "CONFIG SET protected-mode no".
redis_client = redis.StrictRedis(host=redis_ip_address,
port=int(redis_port))
# The client table prefix must be kept in sync with the file
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
REDIS_CLIENT_TABLE_PREFIX = "CL:"
client_keys = redis_client.keys("{}*".format(REDIS_CLIENT_TABLE_PREFIX))
# Filter to live clients on the same node and do some basic checking.
plasma_managers = []
local_schedulers = []
for key in client_keys:
info = redis_client.hgetall(key)
# Ignore clients that were deleted.
deleted = info[b"deleted"]
deleted = bool(int(deleted))
if deleted:
continue
if not use_raylet:
# The client table prefix must be kept in sync with the file
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
REDIS_CLIENT_TABLE_PREFIX = "CL:"
client_keys = redis_client.keys(
"{}*".format(REDIS_CLIENT_TABLE_PREFIX))
# Filter to live clients on the same node and do some basic checking.
plasma_managers = []
local_schedulers = []
for key in client_keys:
info = redis_client.hgetall(key)
assert b"ray_client_id" in info
assert b"node_ip_address" in info
assert b"client_type" in info
client_node_ip_address = info[b"node_ip_address"].decode("ascii")
if (client_node_ip_address == node_ip_address or
(client_node_ip_address == "127.0.0.1" and
redis_ip_address == ray.services.get_node_ip_address())):
if info[b"client_type"].decode("ascii") == "plasma_manager":
plasma_managers.append(info)
elif info[b"client_type"].decode("ascii") == "local_scheduler":
local_schedulers.append(info)
# Make sure that we got at least one plasma manager and local scheduler.
assert len(plasma_managers) >= 1
assert len(local_schedulers) >= 1
# Build the address information.
object_store_addresses = []
for manager in plasma_managers:
address = manager[b"manager_address"].decode("ascii")
port = services.get_port(address)
object_store_addresses.append(
services.ObjectStoreAddress(
name=manager[b"store_socket_name"].decode("ascii"),
manager_name=manager[b"manager_socket_name"].decode("ascii"),
manager_port=port))
scheduler_names = [
scheduler[b"local_scheduler_socket_name"].decode("ascii")
for scheduler in local_schedulers]
client_info = {"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_addresses": object_store_addresses,
"local_scheduler_socket_names": scheduler_names,
# Web UI should be running.
"webui_url": _webui_url_helper(redis_client)}
return client_info
# Ignore clients that were deleted.
deleted = info[b"deleted"]
deleted = bool(int(deleted))
if deleted:
continue
assert b"ray_client_id" in info
assert b"node_ip_address" in info
assert b"client_type" in info
client_node_ip_address = info[b"node_ip_address"].decode("ascii")
if (client_node_ip_address == node_ip_address or
(client_node_ip_address == "127.0.0.1" and
redis_ip_address == ray.services.get_node_ip_address())):
if info[b"client_type"].decode("ascii") == "plasma_manager":
plasma_managers.append(info)
elif info[b"client_type"].decode("ascii") == "local_scheduler":
local_schedulers.append(info)
# Make sure that we got at least one plasma manager and local
# scheduler.
assert len(plasma_managers) >= 1
assert len(local_schedulers) >= 1
# Build the address information.
object_store_addresses = []
for manager in plasma_managers:
address = manager[b"manager_address"].decode("ascii")
port = services.get_port(address)
object_store_addresses.append(
services.ObjectStoreAddress(
name=manager[b"store_socket_name"].decode("ascii"),
manager_name=manager[b"manager_socket_name"].decode(
"ascii"),
manager_port=port))
scheduler_names = [
scheduler[b"local_scheduler_socket_name"].decode("ascii")
for scheduler in local_schedulers]
client_info = {"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_addresses": object_store_addresses,
"local_scheduler_socket_names": scheduler_names,
# Web UI should be running.
"webui_url": _webui_url_helper(redis_client)}
return client_info
# Handle the raylet case.
else:
# In the raylet code path, all client data is stored in a zset at the
# key for the nil client.
client_key = b"CLIENT:" + NIL_CLIENT_ID
clients = redis_client.zrange(client_key, 0, -1)
raylets = []
for client_message in clients:
client = ClientTableData.GetRootAsClientTableData(client_message,
0)
client_node_ip_address = client.NodeManagerAddress().decode(
"ascii")
if (client_node_ip_address == node_ip_address or
(client_node_ip_address == "127.0.0.1" and
redis_ip_address == ray.services.get_node_ip_address())):
raylets.append(client)
# TODO(rkn): The ObjectStoreSocketName field does not exist.
object_store_addresses = [
raylet.ObjectStoreSocketName().decode("ascii")
for raylet in raylets]
raylet_socket_names = [raylet.NodeManagerAddress().decode("ascii") for
raylet in raylets]
return {"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_addresses": object_store_addresses,
"raylet_socket_names": raylet_socket_names,
# Web UI should be running.
"webui_url": _webui_url_helper(redis_client)}
def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5):
def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5,
use_raylet=False):
counter = 0
while True:
try:
return get_address_info_from_redis_helper(redis_address,
node_ip_address)
node_ip_address,
use_raylet=use_raylet)
except Exception as e:
if counter == num_retries:
raise
@@ -1281,7 +1329,8 @@ def _init(address_info=None,
redis_max_clients=None,
plasma_directory=None,
huge_pages=False,
include_webui=True):
include_webui=True,
use_raylet=False):
"""Helper method to connect to an existing Ray cluster or start a new one.
This method handles two cases. Either a Ray cluster already exists and we
@@ -1336,6 +1385,8 @@ def _init(address_info=None,
Store with hugetlbfs support. Requires plasma_directory.
include_webui: Boolean flag indicating whether to start the web
UI, which is a Jupyter notebook.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
Returns:
Address information about the started processes.
@@ -1402,7 +1453,8 @@ def _init(address_info=None,
redis_max_clients=redis_max_clients,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
include_webui=include_webui)
include_webui=include_webui,
use_raylet=use_raylet)
else:
if redis_address is None:
raise Exception("When connecting to an existing cluster, "
@@ -1439,7 +1491,8 @@ def _init(address_info=None,
node_ip_address = services.get_node_ip_address(redis_address)
# Get the address info of the processes to connect to from Redis.
address_info = get_address_info_from_redis(redis_address,
node_ip_address)
node_ip_address,
use_raylet=use_raylet)
# Connect this driver to Redis, the object store, and the local scheduler.
# Choose the first object store and local scheduler if there are multiple.
@@ -1453,13 +1506,17 @@ def _init(address_info=None,
"redis_address": address_info["redis_address"],
"store_socket_name": (
address_info["object_store_addresses"][0].name),
"manager_socket_name": (
address_info["object_store_addresses"][0].manager_name),
"local_scheduler_socket_name": (
address_info["local_scheduler_socket_names"][0]),
"webui_url": address_info["webui_url"]}
if not use_raylet:
driver_address_info["manager_socket_name"] = (
address_info["object_store_addresses"][0].manager_name)
driver_address_info["local_scheduler_socket_name"] = (
address_info["local_scheduler_socket_names"][0])
else:
driver_address_info["raylet_socket_name"] = (
address_info["raylet_socket_name"])
connect(driver_address_info, object_id_seed=object_id_seed,
mode=driver_mode, worker=global_worker)
mode=driver_mode, worker=global_worker, use_raylet=use_raylet)
return address_info
@@ -1469,7 +1526,8 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
num_cpus=None, num_gpus=None, resources=None,
num_custom_resource=None, num_redis_shards=None,
redis_max_clients=None, plasma_directory=None,
huge_pages=False, include_webui=True, object_store_memory=None):
huge_pages=False, include_webui=True, object_store_memory=None,
use_raylet=False):
"""Connect to an existing Ray cluster or start one and connect to it.
This method handles two cases. Either a Ray cluster already exists and we
@@ -1513,6 +1571,9 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
UI, which is a Jupyter notebook.
object_store_memory: The amount of memory (in bytes) to start the
object store with.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
Returns:
Address information about the started processes.
@@ -1539,7 +1600,8 @@ def init(redis_address=None, node_ip_address=None, object_id_seed=None,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
include_webui=include_webui,
object_store_memory=object_store_memory)
object_store_memory=object_store_memory,
use_raylet=use_raylet)
def cleanup(worker=global_worker):
@@ -1818,7 +1880,8 @@ def import_thread(worker, mode):
pass
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker,
use_raylet=False):
"""Connect this worker to the local scheduler, to Plasma, and to Redis.
Args:
@@ -1828,6 +1891,8 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
deterministic.
mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE,
PYTHON_MODE, and SILENT_MODE.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
"""
check_main_thread()
# Do some basic checking to make sure we didn't call ray.init twice.
@@ -1842,6 +1907,7 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
worker.actor_id = NIL_ACTOR_ID
worker.connected = True
worker.set_mode(mode)
worker.use_raylet = use_raylet
# The worker.events field is used to aggregate logging information and
# display it in the web UI. Note that Python lists are protected by the GIL,
# which is important because we will append to this field from multiple
@@ -1909,8 +1975,9 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
"driver_id": worker.worker_id,
"start_time": time.time(),
"plasma_store_socket": info["store_socket_name"],
"plasma_manager_socket": info["manager_socket_name"],
"local_scheduler_socket": info["local_scheduler_socket_name"]}
"plasma_manager_socket": info.get("manager_socket_name"),
"local_scheduler_socket": info.get("local_scheduler_socket_name"),
"raylet_socket": info.get("raylet_socket_name")}
driver_info["name"] = (main.__file__ if hasattr(main, "__file__")
else "INTERACTIVE MODE")
worker.redis_client.hmset(b"Drivers:" + worker.worker_id, driver_info)
@@ -1933,11 +2000,22 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
raise Exception("This code should be unreachable.")
# Create an object store client.
worker.plasma_client = plasma.connect(info["store_socket_name"],
info["manager_socket_name"],
64)
if not worker.use_raylet:
worker.plasma_client = plasma.connect(info["store_socket_name"],
info["manager_socket_name"],
64)
else:
worker.plasma_client = plasma.connect(info["store_socket_name"],
"",
64)
if not worker.use_raylet:
local_scheduler_socket = info["local_scheduler_socket_name"]
else:
local_scheduler_socket = info["raylet_socket_name"]
worker.local_scheduler_client = ray.local_scheduler.LocalSchedulerClient(
info["local_scheduler_socket_name"], worker.worker_id, is_worker)
local_scheduler_socket, worker.worker_id, is_worker)
# If this is a driver, set the current task ID, the task driver ID, and set
# the task index to 0.
@@ -2275,9 +2353,10 @@ def flush_log(worker=global_worker):
"""Send the logged worker events to the global state store."""
event_log_key = b"event_log:" + worker.worker_id
event_log_value = json.dumps(worker.events)
worker.local_scheduler_client.log_event(event_log_key,
event_log_value,
time.time())
if not worker.use_raylet:
worker.local_scheduler_client.log_event(event_log_key,
event_log_value,
time.time())
worker.events = []
@@ -2367,6 +2446,9 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker):
A list of object IDs that are ready and a list of the remaining object
IDs.
"""
if worker.use_raylet:
print("plasma_client.wait has not been implemented yet")
return
if isinstance(object_ids, ray.local_scheduler.ObjectID):
raise TypeError(
+8 -4
View File
@@ -16,10 +16,12 @@ parser.add_argument("--redis-address", required=True, type=str,
help="the address to use for Redis")
parser.add_argument("--object-store-name", required=True, type=str,
help="the object store's name")
parser.add_argument("--object-store-manager-name", required=True, type=str,
parser.add_argument("--object-store-manager-name", required=False, type=str,
help="the object store manager's name")
parser.add_argument("--local-scheduler-name", required=True, type=str,
parser.add_argument("--local-scheduler-name", required=False, type=str,
help="the local scheduler's name")
parser.add_argument("--raylet-name", required=False, type=str,
help="the raylet's name")
if __name__ == "__main__":
@@ -29,9 +31,11 @@ if __name__ == "__main__":
"redis_address": args.redis_address,
"store_socket_name": args.object_store_name,
"manager_socket_name": args.object_store_manager_name,
"local_scheduler_socket_name": args.local_scheduler_name}
"local_scheduler_socket_name": args.local_scheduler_name,
"raylet_socket_name": args.raylet_name}
ray.worker.connect(info, mode=ray.WORKER_MODE)
ray.worker.connect(info, mode=ray.WORKER_MODE,
use_raylet=(args.raylet_name is not None))
error_explanation = """
This error is unexpected and should not have happened. Somehow a worker
+1
View File
@@ -23,6 +23,7 @@ ray_files = [
"ray/core/src/local_scheduler/local_scheduler",
"ray/core/src/local_scheduler/liblocal_scheduler_library.so",
"ray/core/src/global_scheduler/global_scheduler",
"ray/core/src/ray/raylet/raylet",
"ray/WebUI.ipynb"
]