diff --git a/.travis.yml b/.travis.yml index cce1ee8df..a2963311c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -174,6 +174,11 @@ script: - python -m pytest -v --durations=10 test/cython_test.py - python -m pytest -v --durations=10 test/credis_test.py - python -m pytest -v --durations=10 test/node_manager_test.py + # TODO(yuhguo): object_manager_test.py requires a lot of CPU/memory, and + # better be put in Jenkins. However, it fails frequently in Jenkins, but + # works well in Travis. We should consider moving it back to Jenkins once + # we figure out the reason. + - python -m pytest -v --durations=10 test/object_manager_test.py # ray temp file tests - python -m pytest -v --durations=10 test/tempfile_test.py diff --git a/python/ray/function_manager.py b/python/ray/function_manager.py index 94ff22baf..659f18dec 100644 --- a/python/ray/function_manager.py +++ b/python/ray/function_manager.py @@ -444,7 +444,14 @@ class FunctionActorManager(object): # the function from GCS. with profiling.profile("wait_for_function", worker=self._worker): self._wait_for_function(function_descriptor, driver_id) - return self._function_execution_info[driver_id][function_id] + try: + info = self._function_execution_info[driver_id][function_id] + except KeyError as e: + message = ("Error occurs in get_execution_info: " + "driver_id: %s, function_descriptor: %s. Message: %s" % + (binary_to_hex(driver_id), function_descriptor, e)) + raise KeyError(message) + return info def _wait_for_function(self, function_descriptor, driver_id, timeout=10): """Wait until the function to be executed is present on this worker. @@ -509,7 +516,8 @@ class FunctionActorManager(object): def export_actor_class(self, Class, actor_method_names, checkpoint_interval): function_descriptor = FunctionDescriptor.from_class(Class) - key = b"ActorClass:" + function_descriptor.function_id.id() + key = (b"ActorClass:" + self._worker.task_driver_id.id() + b":" + + function_descriptor.function_id.id()) actor_class_info = { "class_name": Class.__name__, "module": Class.__module__, @@ -539,7 +547,8 @@ class FunctionActorManager(object): # because of https://github.com/ray-project/ray/issues/1146. def load_actor(self, driver_id, function_descriptor): - key = b"ActorClass:" + function_descriptor.function_id.id() + key = (b"ActorClass:" + driver_id + b":" + + function_descriptor.function_id.id()) # Wait for the actor class key to have been imported by the # import thread. TODO(rkn): It shouldn't be possible to end # up in an infinite loop here, but we should push an error to diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index 403b9e58c..06a927b18 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -427,8 +427,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --stop '{"training_iteration": 2}' \ --config '{"num_workers": 2, "use_pytorch": true, "sample_async": false}' -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA python -m pytest -v /ray/test/object_manager_test.py - python3 $ROOT_DIR/multi_node_docker_test.py \ --docker-image=$DOCKER_SHA \ --num-nodes=5 \