Refactor actor task queues (#1118)

* Refactor add_task_to_actor_queue into queue_actor_task and insert_actor_task_queue

* Refactor actor task queue to share the waiting task queue

* Fix
This commit is contained in:
Stephanie Wang
2017-10-13 20:52:11 -07:00
committed by Robert Nishihara
parent 79ea205b3e
commit 15486a14a0
6 changed files with 188 additions and 95 deletions
+10 -5
View File
@@ -155,6 +155,9 @@ def make_actor_method_executor(worker, method_name, method):
if not actor_checkpoint_failed:
put_dummy_object(worker, dummy_return_id)
worker.actor_task_counter = task_counter + 1
# Once the actor has resumed from a checkpoint, it counts as
# loaded.
worker.actor_loaded = True
# Report to the local scheduler whether this task succeeded in
# loading the checkpoint.
worker.actor_checkpoint_failed = actor_checkpoint_failed
@@ -168,6 +171,8 @@ def make_actor_method_executor(worker, method_name, method):
# case the method throws an exception.
put_dummy_object(worker, dummy_return_id)
worker.actor_task_counter = task_counter + 1
# Once the actor executes a task, it counts as loaded.
worker.actor_loaded = True
# Execute the actor method.
return method(actor, *args)
return actor_method_executor
@@ -408,9 +413,9 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
error_to_return = None
# Save or resume the checkpoint.
if previous_object_id in worker.actor_pinned_objects:
# The preceding task executed on this actor instance. Save the
# checkpoint.
if worker.actor_loaded:
# The actor has already loaded, so this is normal execution.
# Save the checkpoint.
print("Saving actor checkpoint. actor_counter = {}."
.format(task_counter))
actor_key = b"Actor:" + worker.actor_id
@@ -437,8 +442,8 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
# so we still consider the task successful.
error_to_return = error
else:
# The preceding task has not yet executed on this actor
# instance. Try to resume from the most recent checkpoint.
# The actor has not yet loaded. Try loading it from the most
# recent checkpoint.
checkpoint_index, checkpoint = get_actor_checkpoint(
worker, worker.actor_id)
if checkpoint_index == task_counter:
+4
View File
@@ -227,6 +227,10 @@ class Worker(object):
self.make_actor = None
self.actors = {}
self.actor_task_counter = 0
# Whether an actor instance has been loaded yet. The actor counts as
# loaded once it has either executed its first task or successfully
# resumed from a checkpoint.
self.actor_loaded = False
# This field is used to report actor checkpoint failure for the last
# task assigned. Workers are not assigned a task on startup, so we
# initialize to False.