Leave ray.wait calls open until the task or actor exits (#5234)

* Regression test

* Split TaskDependencyManager::SubscribeDependencies into ray.get and ray.wait dependencies
- Some initial implementation

* unit test

* Improve unit tests for TaskDependencyManager

* Implement SubscribeWaitDependencies and UnsubscribeWaitDependencies, unit tests passing

* Add ray.wait python test for drivers that exit early

* Add WorkerID to Worker

* Update test to use two nodes

* Regression test for ray.wait passes

* Extend regression test to include ray.wait from an actor

* Fix ClientID and WorkerIDs

* lint

* lint

* Remove unnecessary ray_get argument

* fix build
This commit is contained in:
Stephanie Wang
2019-07-23 11:55:28 -07:00
committed by GitHub
parent a3d4f9f16d
commit 15959b0f0d
13 changed files with 493 additions and 118 deletions
+50 -1
View File
@@ -411,7 +411,7 @@ def test_driver_exiting_when_worker_blocked(call_ray_start):
ray.init(redis_address=redis_address)
# Define a driver that creates two tasks, one that runs forever and the
# other blocked on the first.
# other blocked on the first in a `ray.get`.
driver_script = """
import time
import ray
@@ -425,6 +425,30 @@ def g():
g.remote()
time.sleep(1)
print("success")
""".format(redis_address)
# Create some drivers and let them exit and make sure everything is
# still alive.
for _ in range(3):
out = run_string_as_driver(driver_script)
# Make sure the driver that was just run ran to completion.
assert "success" in out
# Define a driver that creates two tasks, one that runs forever and the
# other blocked on the first in a `ray.wait`.
driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
time.sleep(10**6)
@ray.remote
def g():
ray.wait([f.remote()])
g.remote()
time.sleep(1)
print("success")
""".format(redis_address)
# Create some drivers and let them exit and make sure everything is
@@ -448,6 +472,31 @@ def g(x):
g.remote(ray.ObjectID(ray.utils.hex_to_binary("{}")))
time.sleep(1)
print("success")
""".format(redis_address, nonexistent_id_hex)
# Create some drivers and let them exit and make sure everything is
# still alive.
for _ in range(3):
out = run_string_as_driver(driver_script)
# Simulate the nonexistent dependency becoming available.
ray.worker.global_worker.put_object(
ray.ObjectID(nonexistent_id_bytes), None)
# Make sure the driver that was just run ran to completion.
assert "success" in out
nonexistent_id_bytes = _random_string()
nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
# Define a driver that calls `ray.wait` on a nonexistent object.
driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def g():
ray.wait(ray.ObjectID(ray.utils.hex_to_binary("{}")))
g.remote()
time.sleep(1)
print("success")
""".format(redis_address, nonexistent_id_hex)
# Create some drivers and let them exit and make sure everything is