Dynamically grow worker pool to partially solve hanging workloads (#286)

* First pass at a policy to solve deadlock

* Address Robert's comments

* stress test

* unit test

* Fix test cases

* Fix test for python3

* add more logging

* White space.
This commit is contained in:
Stephanie Wang
2017-02-17 17:08:52 -08:00
committed by Robert Nishihara
parent 0bbf08a4ac
commit a0dd3a44c0
11 changed files with 393 additions and 38 deletions
+6
View File
@@ -471,6 +471,7 @@ class Worker(object):
# their original index in the object_ids argument.
unready_ids = dict((object_id, i) for (i, (object_id, val)) in
enumerate(final_results) if val is None)
was_blocked = (len(unready_ids) > 0)
# Try reconstructing any objects we haven't gotten yet. Try to get them
# until GET_TIMEOUT_MILLISECONDS milliseconds pass, then repeat.
while len(unready_ids) > 0:
@@ -487,6 +488,11 @@ class Worker(object):
final_results[index] = (object_id, val)
unready_ids.pop(object_id)
# If there were objects that we weren't able to get locally, let the local
# scheduler know that we're now unblocked.
if was_blocked:
self.photon_client.notify_unblocked()
# Unwrap the object from the list (it was wrapped by put_object).
assert len(final_results) == len(object_ids)
for i in range(len(final_results)):