mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:00:10 +08:00
Implement a first pass at actors in the API. (#242)
* Implement actor field for tasks * Implement actor management in local scheduler. * initial python frontend for actors * import actors on worker * IPython code completion and tests * prepare creating actors through local schedulers * add actor id to PyTask * submit actor calls to local scheduler * starting to integrate * simple fix * Fixes from rebasing. * more work on python actors * Improve local scheduler actor handlers. * Pass actor ID to local scheduler when connecting a client. * first working version of actors * fixing actors * fix creating two copies of the same actor * fix actors * remove sleep * get rid of export synchronization * update * insert actor methods into the queue in the right order * remove print statements * make it compile again after rebase * Minor updates. * fix python actor ids * Pass actor_id to start_worker. * add test * Minor changes. * Update actor tests. * Temporary plan for import counter. * Temporarily fix import counters. * Fix some tests. * Fixes. * Make actor creation non-blocking. * Fix test? * Fix actors on Python 2. * fix rare case. * Fix python 2 test. * More tests. * Small fixes. * Linting. * Revert tensorflow version to 0.12.0 temporarily. * Small fix. * Enhance inheritance test.
This commit is contained in:
committed by
Robert Nishihara
parent
072eadd57f
commit
12a68e84d2
@@ -23,6 +23,8 @@ PLASMA_STORE_MEMORY = 1000000000
|
||||
ID_SIZE = 20
|
||||
NUM_CLUSTER_NODES = 2
|
||||
|
||||
NIL_ACTOR_ID = 20 * b"\xff"
|
||||
|
||||
# These constants must match the scheduling state enum in task.h.
|
||||
TASK_STATUS_WAITING = 1
|
||||
TASK_STATUS_SCHEDULED = 2
|
||||
@@ -92,7 +94,7 @@ class TestGlobalScheduler(unittest.TestCase):
|
||||
redis_address=redis_address,
|
||||
static_resource_list=[10, 0])
|
||||
# Connect to the scheduler.
|
||||
photon_client = photon.PhotonClient(local_scheduler_name)
|
||||
photon_client = photon.PhotonClient(local_scheduler_name, NIL_ACTOR_ID)
|
||||
self.photon_clients.append(photon_client)
|
||||
self.local_scheduler_pids.append(p4)
|
||||
|
||||
@@ -149,7 +151,9 @@ class TestGlobalScheduler(unittest.TestCase):
|
||||
def test_task_default_resources(self):
|
||||
task1 = photon.Task(random_driver_id(), random_function_id(), [random_object_id()], 0, random_task_id(), 0)
|
||||
self.assertEqual(task1.required_resources(), [1.0, 0.0])
|
||||
task2 = photon.Task(random_driver_id(), random_function_id(), [random_object_id()], 0, random_task_id(), 0, [1.0, 2.0])
|
||||
task2 = photon.Task(random_driver_id(), random_function_id(),
|
||||
[random_object_id()], 0, random_task_id(), 0,
|
||||
photon.ObjectID(NIL_ACTOR_ID), 0, [1.0, 2.0])
|
||||
self.assertEqual(task2.required_resources(), [1.0, 2.0])
|
||||
|
||||
def test_redis_only_single_task(self):
|
||||
|
||||
@@ -18,6 +18,8 @@ import plasma
|
||||
USE_VALGRIND = False
|
||||
ID_SIZE = 20
|
||||
|
||||
NIL_ACTOR_ID = 20 * b"\xff"
|
||||
|
||||
def random_object_id():
|
||||
return photon.ObjectID(np.random.bytes(ID_SIZE))
|
||||
|
||||
@@ -39,7 +41,7 @@ class TestPhotonClient(unittest.TestCase):
|
||||
# Start a local scheduler.
|
||||
scheduler_name, self.p2 = photon.start_local_scheduler(plasma_store_name, use_valgrind=USE_VALGRIND)
|
||||
# Connect to the scheduler.
|
||||
self.photon_client = photon.PhotonClient(scheduler_name)
|
||||
self.photon_client = photon.PhotonClient(scheduler_name, NIL_ACTOR_ID)
|
||||
|
||||
def tearDown(self):
|
||||
# Check that the processes are still alive.
|
||||
|
||||
@@ -16,5 +16,6 @@ if hasattr(ctypes, "windll"):
|
||||
import ray.experimental
|
||||
import ray.serialization
|
||||
from ray.worker import register_class, error_info, init, connect, disconnect, get, put, wait, remote, log_event, log_span, flush_log
|
||||
from ray.actor import actor
|
||||
from ray.worker import EnvironmentVariable, env
|
||||
from ray.worker import SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import hashlib
|
||||
import inspect
|
||||
import numpy as np
|
||||
import photon
|
||||
import random
|
||||
|
||||
import ray.pickling as pickling
|
||||
import ray.worker
|
||||
import ray.experimental.state as state
|
||||
|
||||
def random_string():
    """Return 20 random bytes drawn from NumPy's global random state."""
    num_bytes = 20
    return np.random.bytes(num_bytes)
|
||||
|
||||
def random_actor_id():
    """Create a new actor ID by wrapping random bytes in a photon.ObjectID."""
    raw_id = random_string()
    return photon.ObjectID(raw_id)
|
||||
|
||||
def get_actor_method_function_id(attr):
    """Compute the function ID that identifies an actor method.

    The ID is the SHA-1 digest of the ASCII-encoded method name, so it is
    determined entirely by the name of the method.

    Args:
        attr (str): The attribute name of the method.

    Returns:
        A photon.ObjectID wrapping the digest of the method name.
    """
    digest = hashlib.sha1(attr.encode("ascii")).digest()
    return photon.ObjectID(digest)
|
||||
|
||||
def fetch_and_register_actor(key, worker):
    """Import an actor class from Redis and register it on this worker.

    Reads the actor entry stored under ``key``, unpickles the class, creates
    an uninitialized instance of it (via ``__new__``; the actor's ``__init__``
    is not run here) and registers every function/method of the class in the
    worker's function tables.

    Args:
        key: The Redis key of the exported actor entry.
        worker: The worker to register the actor on. Its ``redis_client``,
            ``actors``, ``function_names`` and ``functions`` attributes are
            used.

    Raises:
        NotImplementedError: If the actor class cannot be unpickled (proper
            error handling for this case is still a TODO).
    """
    # Only some of the stored fields are needed here; the request itself is
    # kept unchanged.
    (_driver_id, actor_id_str, _actor_name, module, pickled_class,
     _class_export_counter) = worker.redis_client.hmget(
         key, ["driver_id", "actor_id", "name", "module", "class",
               "class_export_counter"])
    module = module.decode("ascii")

    # NOTE(review): this unpickles data read from Redis. That is only safe if
    # everything that can write to this Redis instance is trusted.
    try:
        unpickled_class = pickling.loads(pickled_class)
    except Exception as e:
        # NotImplemented is a non-callable singleton, not an exception, so
        # NotImplementedError must be raised here.
        raise NotImplementedError(
            "TODO(pcm): failed to unpickle actor class: {}".format(e))

    # TODO(pcm): Why is the below line necessary?
    unpickled_class.__module__ = module
    # The actor instance is keyed by the raw actor ID bytes.
    worker.actors[actor_id_str] = unpickled_class.__new__(unpickled_class)

    def is_callable_member(member):
        # Plain functions on Python 3, unbound methods on Python 2.
        return inspect.isfunction(member) or inspect.ismethod(member)

    for name, method in inspect.getmembers(unpickled_class,
                                           predicate=is_callable_member):
        function_id = get_actor_method_function_id(name).id()
        worker.function_names[function_id] = name
        worker.functions[function_id] = method
|
||||
|
||||
def export_actor(actor_id, Class, worker):
    """Export an actor class to Redis and announce it to a local scheduler.

    A local scheduler is picked at random and notified on the
    "actor_notifications" channel, then the pickled class is stored in Redis
    and its key is appended to the "Exports" list.

    Args:
        actor_id: The ID of the actor.
        Class: The class to be exported as an actor.
        worker: The worker object performing the export.

    Raises:
        NotImplementedError: If the worker has no mode set yet (caching of
            actors is not implemented).
        Exception: If the worker is in a mode for which actor export is not
            supported.
    """
    ray.worker.check_main_thread()
    if worker.mode is None:
        # NotImplemented is a non-callable singleton, not an exception, so
        # NotImplementedError must be raised here.
        raise NotImplementedError("TODO(pcm): Cache actors")

    # The export counter is computed differently depending on whether we are
    # currently in a driver or a worker. This is decided before anything is
    # published so that an unsupported mode fails without side effects.
    if worker.mode in [ray.SCRIPT_MODE, ray.SILENT_MODE]:
        export_counter = worker.driver_export_counter
    elif worker.mode == ray.WORKER_MODE:
        # We don't actually need export counters for actors.
        export_counter = 0
    else:
        raise Exception(
            "Actors cannot be exported in mode {}.".format(worker.mode))

    key = "Actor:{}".format(actor_id.id())
    pickled_class = pickling.dumps(Class)

    # Select a local scheduler for the actor. random.choice raises IndexError
    # if no local scheduler is registered.
    local_schedulers = state.get_local_schedulers()
    local_scheduler_id = random.choice(local_schedulers)

    worker.redis_client.publish("actor_notifications",
                                actor_id.id() + local_scheduler_id)

    d = {"driver_id": worker.task_driver_id.id(),
         "actor_id": actor_id.id(),
         "name": Class.__name__,
         "module": Class.__module__,
         "class": pickled_class,
         "class_export_counter": export_counter}
    worker.redis_client.hmset(key, d)
    worker.redis_client.rpush("Exports", key)
    # NOTE(review): the counter is bumped for every actor export, including
    # exports made from workers -- confirm this matches the intended
    # import-counter protocol.
    worker.driver_export_counter += 1
|
||||
|
||||
def actor(Class):
    """Turn a class into an actor class.

    Instantiating the returned class exports ``Class`` as an actor and
    produces a local stub object. Calling a method on the stub submits an
    actor task for that method instead of running it locally.

    Args:
        Class: The class to wrap as an actor.

    Returns:
        A stub class whose instances forward method calls to the actor.
    """

    # The function actor_method_call gets called if somebody tries to call a
    # method on their local actor stub object.
    def actor_method_call(actor_id, attr, *args, **kwargs):
        ray.worker.check_connected()
        ray.worker.check_main_thread()
        args = list(args)
        if kwargs:
            raise Exception("Actors currently do not support **kwargs.")
        function_id = get_actor_method_function_id(attr)
        # TODO(pcm): Extend args with keyword args.
        # For now, actor methods should not require resources beyond the
        # resources used by the actor.
        num_cpus = 0
        num_gpus = 0
        object_ids = ray.worker.global_worker.submit_task(function_id, "", args,
                                                          num_cpus, num_gpus,
                                                          actor_id=actor_id)
        # A single return value is unwrapped; with no return values the call
        # returns None.
        if len(object_ids) == 1:
            return object_ids[0]
        elif len(object_ids) > 1:
            return object_ids

    class NewClass(object):
        def __init__(self, *args, **kwargs):
            self._ray_actor_id = random_actor_id()
            self._ray_actor_methods = {
                k: v for (k, v) in inspect.getmembers(
                    Class,
                    predicate=(lambda x: (inspect.isfunction(x) or
                                          inspect.ismethod(x))))}
            export_actor(self._ray_actor_id, Class, ray.worker.global_worker)
            # Call __init__ as a remote function.
            if "__init__" in self._ray_actor_methods:
                actor_method_call(self._ray_actor_id, "__init__",
                                  *args, **kwargs)
            else:
                print("WARNING: this object has no __init__ method.")

        # Make tab completion work. __dir__ must return a list (Python 2
        # rejects any other type), so return the method names, not the dict.
        def __dir__(self):
            return list(self._ray_actor_methods.keys())

        def __getattribute__(self, attr):
            # The following is needed so we can still access
            # self._ray_actor_id and self._ray_actor_methods.
            if attr in ["_ray_actor_id", "_ray_actor_methods"]:
                return super(NewClass, self).__getattribute__(attr)
            if attr in self._ray_actor_methods:
                return lambda *args, **kwargs: actor_method_call(
                    self._ray_actor_id, attr, *args, **kwargs)
            # There is no method with this name, so raise an exception.
            raise AttributeError(
                "'{}' Actor object has no attribute '{}'".format(Class, attr))

        def __repr__(self):
            return "Actor(" + self._ray_actor_id.hex() + ")"

    return NewClass
|
||||
|
||||
# Register the actor import handler on the global worker under the "Actor"
# key, so that code looking up fetch_and_register["Actor"] calls
# fetch_and_register_actor.
ray.worker.global_worker.fetch_and_register["Actor"] = fetch_and_register_actor
|
||||
@@ -0,0 +1,13 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray.worker
|
||||
|
||||
def get_local_schedulers():
    """Return the ray client IDs of all local schedulers found in Redis.

    Scans the Redis keys matching "CL:*" and collects the "ray_client_id" of
    every entry whose "client_type" is b"photon".

    Returns:
        A list of the matching "ray_client_id" values as returned by Redis.
    """
    scheduler_ids = []
    for client_key in ray.worker.global_worker.redis_client.keys("CL:*"):
        fields = ray.worker.global_worker.redis_client.hmget(
            client_key, "client_type", "ray_client_id")
        client_type, ray_client_id = fields
        if client_type != b"photon":
            continue
        scheduler_ids.append(ray_client_id)
    return scheduler_ids
|
||||
+110
-24
@@ -9,6 +9,7 @@ import sys
|
||||
import time
|
||||
import traceback
|
||||
import copy
|
||||
import collections
|
||||
import funcsigs
|
||||
import numpy as np
|
||||
import colorama
|
||||
@@ -39,6 +40,9 @@ ERROR_KEY_PREFIX = b"Error:"
|
||||
DRIVER_ID_LENGTH = 20
|
||||
ERROR_ID_LENGTH = 20
|
||||
|
||||
# This must match the definition of NIL_ACTOR_ID in task.h.
|
||||
NIL_ACTOR_ID = 20 * b"\xff"
|
||||
|
||||
# When performing ray.get, wait 1 second before attempting to reconstruct and
|
||||
# fetch the object again.
|
||||
GET_TIMEOUT_MILLISECONDS = 1000
|
||||
@@ -378,15 +382,58 @@ class Worker(object):
|
||||
def __init__(self):
|
||||
"""Initialize a Worker object."""
|
||||
self.functions = {}
|
||||
self.num_return_vals = {}
|
||||
# Use a defaultdict for the number of return values. If this is accessed
|
||||
# with a missing key, the default value of 1 is returned, and that key value
|
||||
# pair is added to the dict.
|
||||
self.num_return_vals = collections.defaultdict(lambda: 1)
|
||||
self.function_names = {}
|
||||
self.function_export_counters = {}
|
||||
self.connected = False
|
||||
self.mode = None
|
||||
self.cached_remote_functions = []
|
||||
self.cached_functions_to_run = []
|
||||
# The driver_export_counter and worker_import_counter are used to make sure
|
||||
# that no task executes before everything it needs is present. For example,
|
||||
# if we define a remote function f, a worker cannot execute a task for f
|
||||
# until the worker has imported the function f.
|
||||
# - When a remote function, a reusable variable, or a function to run is
|
||||
# exported, the driver_export_counter is incremented. These exports must
|
||||
# take place from the driver.
|
||||
# - When an actor is created, the driver_export_counter is NOT
|
||||
# incremented. Note that an actor can be created from a driver or from
|
||||
# any worker.
|
||||
# - When a worker imports a remote function, a reusable variable, or a
|
||||
# function to run, its worker_import_counter is incremented.
|
||||
# - Notably, when an actor is imported, its worker_import_counter is NOT
|
||||
# incremented.
|
||||
# - Whenever a remote function is DEFINED on the driver, it records the
|
||||
# value of the driver_export_counter and a worker will not execute that
|
||||
# remote function until it has imported that many exports (excluding
|
||||
# actors).
|
||||
# - When an actor is defined.
|
||||
# a) If the actor is created on a driver, it records the
|
||||
# driver_export_counter.
|
||||
# b) If the actor is created inside a task on a regular worker, it
|
||||
# records the driver_export_counter associated with the function in
|
||||
# task creating the actor.
|
||||
# c) If the actor is created inside a task on an actor worker, it
|
||||
# records
|
||||
# The worker that ultimately runs the actor will not execute any tasks
|
||||
# until it has imported that many imports.
|
||||
#
|
||||
# TODO(rkn): These counters must be tracked separately for each driver.
|
||||
# TODO(rkn): Maybe none of these counters are necessary? When executing a
|
||||
# regular task, workers can just wait until the function ID is present. When
|
||||
# executing an actor task, the actor worker can just wait until the actor
|
||||
# has been defined.
|
||||
self.driver_export_counter = 0
|
||||
self.worker_import_counter = 0
|
||||
self.fetch_and_register = {}
|
||||
self.actors = {}
|
||||
# Use a defaultdict for the actor counts. If this is accessed with a missing
|
||||
# key, the default value of 0 is returned, and that key value pair is added
|
||||
# to the dict.
|
||||
self.actor_counters = collections.defaultdict(lambda: 0)
|
||||
|
||||
def set_mode(self, mode):
|
||||
"""Set the mode of the worker.
|
||||
@@ -479,7 +526,7 @@ class Worker(object):
|
||||
assert final_results[i][0] == object_ids[i].id()
|
||||
return [result[1][0] for result in final_results]
|
||||
|
||||
def submit_task(self, function_id, func_name, args, num_cpus, num_gpus):
|
||||
def submit_task(self, function_id, func_name, args, num_cpus, num_gpus, actor_id=photon.ObjectID(NIL_ACTOR_ID)):
|
||||
"""Submit a remote task to the scheduler.
|
||||
|
||||
Tell the scheduler to schedule the execution of the function with name
|
||||
@@ -514,10 +561,12 @@ class Worker(object):
|
||||
self.num_return_vals[function_id.id()],
|
||||
self.current_task_id,
|
||||
self.task_index,
|
||||
actor_id, self.actor_counters[actor_id],
|
||||
[num_cpus, num_gpus])
|
||||
# Increment the worker's task index to track how many tasks have been
|
||||
# submitted by the current task so far.
|
||||
self.task_index += 1
|
||||
self.actor_counters[actor_id] += 1
|
||||
self.photon_client.submit(task)
|
||||
|
||||
return task.returns()
|
||||
@@ -856,7 +905,7 @@ def _init(address_info=None,
|
||||
"manager_socket_name": address_info["object_store_addresses"][0].manager_name,
|
||||
"local_scheduler_socket_name": address_info["local_scheduler_socket_names"][0],
|
||||
}
|
||||
connect(driver_address_info, object_id_seed=object_id_seed, mode=driver_mode, worker=global_worker)
|
||||
connect(driver_address_info, object_id_seed=object_id_seed, mode=driver_mode, worker=global_worker, actor_id=NIL_ACTOR_ID)
|
||||
return address_info
|
||||
|
||||
def init(redis_address=None, node_ip_address=None, object_id_seed=None,
|
||||
@@ -1086,6 +1135,9 @@ def import_thread(worker):
|
||||
worker_info_key = "WorkerInfo:{}".format(worker.worker_id)
|
||||
worker.redis_client.hset(worker_info_key, "export_counter", 0)
|
||||
worker.worker_import_counter = 0
|
||||
# The number of imports is similar to the worker_import_counter except that it
|
||||
# also counts actors.
|
||||
num_imported = 0
|
||||
|
||||
# Get the exports that occurred before the call to psubscribe.
|
||||
with worker.lock:
|
||||
@@ -1097,10 +1149,19 @@ def import_thread(worker):
|
||||
fetch_and_register_environment_variable(key, worker=worker)
|
||||
elif key.startswith(b"FunctionsToRun"):
|
||||
fetch_and_execute_function_to_run(key, worker=worker)
|
||||
elif key.startswith(b"Actor"):
|
||||
# Only get the actor if the actor ID matches the actor ID of this
|
||||
# worker.
|
||||
actor_id, = worker.redis_client.hmget(key, "actor_id")
|
||||
if worker.actor_id == actor_id:
|
||||
worker.fetch_and_register["Actor"](key, worker)
|
||||
else:
|
||||
raise Exception("This code should be unreachable.")
|
||||
worker.redis_client.hincrby(worker_info_key, "export_counter", 1)
|
||||
worker.worker_import_counter += 1
|
||||
# Actors do not contribute to the import counter.
|
||||
if not key.startswith(b"Actor"):
|
||||
worker.redis_client.hincrby(worker_info_key, "export_counter", 1)
|
||||
worker.worker_import_counter += 1
|
||||
num_imported += 1
|
||||
|
||||
for msg in worker.import_pubsub_client.listen():
|
||||
with worker.lock:
|
||||
@@ -1108,8 +1169,8 @@ def import_thread(worker):
|
||||
continue
|
||||
assert msg["data"] == b"rpush"
|
||||
num_imports = worker.redis_client.llen("Exports")
|
||||
assert num_imports >= worker.worker_import_counter
|
||||
for i in range(worker.worker_import_counter, num_imports):
|
||||
assert num_imports >= num_imported
|
||||
for i in range(num_imported, num_imports):
|
||||
key = worker.redis_client.lindex("Exports", i)
|
||||
if key.startswith(b"RemoteFunction"):
|
||||
with log_span("ray:import_remote_function", worker=worker):
|
||||
@@ -1120,12 +1181,21 @@ def import_thread(worker):
|
||||
elif key.startswith(b"FunctionsToRun"):
|
||||
with log_span("ray:import_function_to_run", worker=worker):
|
||||
fetch_and_execute_function_to_run(key, worker=worker)
|
||||
elif key.startswith(b"Actor"):
|
||||
# Only get the actor if the actor ID matches the actor ID of this
|
||||
# worker.
|
||||
actor_id, = worker.redis_client.hmget(key, "actor_id")
|
||||
if worker.actor_id == actor_id:
|
||||
worker.fetch_and_register["Actor"](key, worker)
|
||||
else:
|
||||
raise Exception("This code should be unreachable.")
|
||||
worker.redis_client.hincrby(worker_info_key, "export_counter", 1)
|
||||
worker.worker_import_counter += 1
|
||||
# Actors do not contribute to the import counter.
|
||||
if not key.startswith(b"Actor"):
|
||||
worker.redis_client.hincrby(worker_info_key, "export_counter", 1)
|
||||
worker.worker_import_counter += 1
|
||||
num_imported += 1
|
||||
|
||||
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker, actor_id=NIL_ACTOR_ID):
|
||||
"""Connect this worker to the local scheduler, to Plasma, and to Redis.
|
||||
|
||||
Args:
|
||||
@@ -1143,6 +1213,7 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
assert env._cached_environment_variables is not None, error_message
|
||||
# Initialize some fields.
|
||||
worker.worker_id = random_string()
|
||||
worker.actor_id = actor_id
|
||||
worker.connected = True
|
||||
worker.set_mode(mode)
|
||||
# The worker.events field is used to aggregate logging information and display
|
||||
@@ -1163,7 +1234,8 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
# Create an object store client.
|
||||
worker.plasma_client = plasma.PlasmaClient(info["store_socket_name"], info["manager_socket_name"])
|
||||
# Create the local scheduler client.
|
||||
worker.photon_client = photon.PhotonClient(info["local_scheduler_socket_name"])
|
||||
worker.photon_client = photon.PhotonClient(info["local_scheduler_socket_name"], worker.actor_id)
|
||||
# Register the worker with Redis.
|
||||
if mode in [SCRIPT_MODE, SILENT_MODE]:
|
||||
# The concept of a driver is the same as the concept of a "job". Register
|
||||
# the driver/job with Redis here.
|
||||
@@ -1458,7 +1530,11 @@ def wait_for_valid_import_counter(function_id, driver_id, timeout=5, worker=glob
|
||||
may indicate a problem somewhere and we will push an error message to the
|
||||
user.
|
||||
|
||||
If this worker is an actor, then this will wait until the actor has been
|
||||
defined.
|
||||
|
||||
Args:
|
||||
is_actor (bool): True if this worker is an actor, and false otherwise.
|
||||
function_id (str): The ID of the function that we want to execute.
|
||||
driver_id (str): The ID of the driver to push the error message to if this
|
||||
times out.
|
||||
@@ -1469,17 +1545,19 @@ def wait_for_valid_import_counter(function_id, driver_id, timeout=5, worker=glob
|
||||
num_warnings_sent = 0
|
||||
while True:
|
||||
with worker.lock:
|
||||
if function_id.id() in worker.functions and (worker.function_export_counters[function_id.id()] <= worker.worker_import_counter):
|
||||
if worker.actor_id == NIL_ACTOR_ID and function_id.id() in worker.functions and (worker.function_export_counters[function_id.id()] <= worker.worker_import_counter):
|
||||
break
|
||||
if time.time() - start_time > timeout * (num_warnings_sent + 1):
|
||||
if function_id.id() not in worker.functions:
|
||||
warning_message = "This worker was asked to execute a function that it does not have registered. You may have to restart Ray."
|
||||
else:
|
||||
warning_message = "This worker's import counter is too small."
|
||||
if not warning_sent:
|
||||
worker.push_error_to_driver(driver_id, "import_counter",
|
||||
warning_message)
|
||||
warning_sent = True
|
||||
elif worker.actor_id != NIL_ACTOR_ID and worker.actor_id in worker.actors:
|
||||
break
|
||||
if time.time() - start_time > timeout * (num_warnings_sent + 1):
|
||||
if function_id.id() not in worker.functions:
|
||||
warning_message = "This worker was asked to execute a function that it does not have registered. You may have to restart Ray."
|
||||
else:
|
||||
warning_message = "This worker's import counter is too small."
|
||||
if not warning_sent:
|
||||
worker.push_error_to_driver(driver_id, "import_counter",
|
||||
warning_message)
|
||||
warning_sent = True
|
||||
time.sleep(0.001)
|
||||
|
||||
def format_error_message(exception_message, task_exception=False):
|
||||
@@ -1530,6 +1608,7 @@ def main_loop(worker=global_worker):
|
||||
# correct driver.
|
||||
worker.task_driver_id = task.driver_id()
|
||||
worker.current_task_id = task.task_id()
|
||||
worker.current_function_id = task.function_id().id()
|
||||
worker.task_index = 0
|
||||
worker.put_index = 0
|
||||
function_id = task.function_id()
|
||||
@@ -1543,7 +1622,10 @@ def main_loop(worker=global_worker):
|
||||
|
||||
# Execute the task.
|
||||
with log_span("ray:task:execute", worker=worker):
|
||||
outputs = worker.functions[function_id.id()].executor(arguments)
|
||||
if task.actor_id().id() == NIL_ACTOR_ID:
|
||||
outputs = worker.functions[task.function_id().id()].executor(arguments)
|
||||
else:
|
||||
outputs = worker.functions[task.function_id().id()](worker.actors[task.actor_id().id()], *arguments)
|
||||
|
||||
# Store the outputs in the local object store.
|
||||
with log_span("ray:task:store_outputs", worker=worker):
|
||||
@@ -1557,8 +1639,12 @@ def main_loop(worker=global_worker):
|
||||
# occurred, we format the error message differently.
|
||||
# whether the variables "arguments" and "outputs" are defined.
|
||||
if "arguments" in locals() and "outputs" not in locals():
|
||||
# The error occurred during the task execution.
|
||||
traceback_str = format_error_message(traceback.format_exc(), task_exception=True)
|
||||
if task.actor_id().id() == NIL_ACTOR_ID:
|
||||
# The error occurred during the task execution.
|
||||
traceback_str = format_error_message(traceback.format_exc(), task_exception=True)
|
||||
else:
|
||||
# The error occurred during the execution of an actor task.
|
||||
traceback_str = format_error_message(traceback.format_exc())
|
||||
elif "arguments" in locals() and "outputs" in locals():
|
||||
# The error occurred after the task executed.
|
||||
traceback_str = format_error_message(traceback.format_exc())
|
||||
|
||||
@@ -6,6 +6,8 @@ import argparse
|
||||
import numpy as np
|
||||
import redis
|
||||
import traceback
|
||||
import sys
|
||||
import binascii
|
||||
|
||||
import ray
|
||||
|
||||
@@ -15,6 +17,7 @@ parser.add_argument("--redis-address", required=True, type=str, help="the addres
|
||||
parser.add_argument("--object-store-name", required=True, type=str, help="the object store's name")
|
||||
parser.add_argument("--object-store-manager-name", required=True, type=str, help="the object store manager's name")
|
||||
parser.add_argument("--local-scheduler-name", required=True, type=str, help="the local scheduler's name")
|
||||
parser.add_argument("--actor-id", required=False, type=str, help="the actor ID of this worker")
|
||||
|
||||
def random_string():
|
||||
return np.random.bytes(20)
|
||||
@@ -26,7 +29,10 @@ if __name__ == "__main__":
|
||||
"store_socket_name": args.object_store_name,
|
||||
"manager_socket_name": args.object_store_manager_name,
|
||||
"local_scheduler_socket_name": args.local_scheduler_name}
|
||||
ray.worker.connect(info, ray.WORKER_MODE)
|
||||
|
||||
actor_id = binascii.unhexlify(args.actor_id) if not args.actor_id is None else ray.worker.NIL_ACTOR_ID
|
||||
|
||||
ray.worker.connect(info, mode=ray.WORKER_MODE, actor_id=actor_id)
|
||||
|
||||
error_explanation = """
|
||||
This error is unexpected and should not have happened. Somehow a worker crashed
|
||||
|
||||
Reference in New Issue
Block a user