Separate thread locks for worker and function manager. (#4499)

* Separate lock for function manager and worker * Lint * Add test case * Remove print in remote function. * Remove test and add ray.exit_actor * Update python/ray/worker.py Co-Authored-By: guoyuhong <guoyuhong1985@outlook.com> * Move exit_actor from worker.py to actor.py * Update actor.py * Update actor.py
2026-06-28 12:28:10 +08:00 · 2019-04-29 14:55:37 +08:00
parent c578be23a5
commit 4eade036a0
4 changed files with 131 additions and 113 deletions
@@ -234,9 +234,14 @@ class Worker(object):
        Returns:
            The serialization context of the given driver.
        """
-        if driver_id not in self.serialization_context_map:
-            _initialize_serialization(driver_id)
-        return self.serialization_context_map[driver_id]
+        # This function needs to be proctected by a lock, because it will be
+        # called by`register_class_for_serialization`, as well as the import
+        # thread, from different threads. Also, this function will recursively
+        # call itself, so we use RLock here.
+        with self.lock:
+            if driver_id not in self.serialization_context_map:
+                _initialize_serialization(driver_id)
+            return self.serialization_context_map[driver_id]

    def check_connected(self):
        """Check if the worker is connected.
@@ -428,11 +433,7 @@ class Worker(object):
                # Wait a little bit for the import thread to import the class.
                # If we currently have the worker lock, we need to release it
                # so that the import thread can acquire it.
-                if self.mode == WORKER_MODE:
-                    self.lock.release()
                time.sleep(0.01)
-                if self.mode == WORKER_MODE:
-                    self.lock.acquire()

                if time.time() - start_time > error_timeout:
                    warning_message = ("This worker or driver is waiting to "
@@ -968,45 +969,37 @@ class Worker(object):
            driver_id, function_descriptor)

        # Execute the task.
-        # TODO(rkn): Consider acquiring this lock with a timeout and pushing a
-        # warning to the user if we are waiting too long to acquire the lock
-        # because that may indicate that the system is hanging, and it'd be
-        # good to know where the system is hanging.
-        with self.lock:
-            function_name = execution_info.function_name
-            extra_data = {
-                "name": function_name,
-                "task_id": task.task_id().hex()
-            }
-            if task.actor_id().is_nil():
-                if task.actor_creation_id().is_nil():
-                    title = "ray_worker:{}()".format(function_name)
-                    next_title = "ray_worker"
-                else:
-                    actor = self.actors[task.actor_creation_id()]
-                    title = "ray_{}:{}()".format(actor.__class__.__name__,
-                                                 function_name)
-                    next_title = "ray_{}".format(actor.__class__.__name__)
+        function_name = execution_info.function_name
+        extra_data = {"name": function_name, "task_id": task.task_id().hex()}
+        if task.actor_id().is_nil():
+            if task.actor_creation_id().is_nil():
+                title = "ray_worker:{}()".format(function_name)
+                next_title = "ray_worker"
            else:
-                actor = self.actors[task.actor_id()]
+                actor = self.actors[task.actor_creation_id()]
                title = "ray_{}:{}()".format(actor.__class__.__name__,
                                             function_name)
                next_title = "ray_{}".format(actor.__class__.__name__)
-            with profiling.profile("task", extra_data=extra_data):
-                with _changeproctitle(title, next_title):
-                    self._process_task(task, execution_info)
-                # Reset the state fields so the next task can run.
-                self.task_context.current_task_id = TaskID.nil()
-                self.task_context.task_index = 0
-                self.task_context.put_index = 1
-                if self.actor_id.is_nil():
-                    # Don't need to reset task_driver_id if the worker is an
-                    # actor. Because the following tasks should all have the
-                    # same driver id.
-                    self.task_driver_id = DriverID.nil()
-                    # Reset signal counters so that the next task can get
-                    # all past signals.
-                    ray_signal.reset()
+        else:
+            actor = self.actors[task.actor_id()]
+            title = "ray_{}:{}()".format(actor.__class__.__name__,
+                                         function_name)
+            next_title = "ray_{}".format(actor.__class__.__name__)
+        with profiling.profile("task", extra_data=extra_data):
+            with _changeproctitle(title, next_title):
+                self._process_task(task, execution_info)
+            # Reset the state fields so the next task can run.
+            self.task_context.current_task_id = TaskID.nil()
+            self.task_context.task_index = 0
+            self.task_context.put_index = 1
+            if self.actor_id.is_nil():
+                # Don't need to reset task_driver_id if the worker is an
+                # actor. Because the following tasks should all have the
+                # same driver id.
+                self.task_driver_id = DriverID.nil()
+                # Reset signal counters so that the next task can get
+                # all past signals.
+                ray_signal.reset()

        # Increase the task execution counter.
        self.function_actor_manager.increase_task_counter(
@@ -1645,10 +1638,9 @@ def listen_error_messages_raylet(worker, task_error_queue, threads_stopped):

    try:
        # Get the exports that occurred before the call to subscribe.
-        with worker.lock:
-            error_messages = global_state.error_messages(worker.task_driver_id)
-            for error_message in error_messages:
-                logger.error(error_message)
+        error_messages = global_state.error_messages(worker.task_driver_id)
+        for error_message in error_messages:
+            logger.error(error_message)

        while True:
            # Exit if we received a signal that we should stop.
@@ -1774,7 +1766,7 @@ def connect(node,
                traceback_str,
                driver_id=None)

-    worker.lock = threading.Lock()
+    worker.lock = threading.RLock()

    # Create an object for interfacing with the global state.
    global_state._initialize_global_state(