[Core] Error info pubsub (Remove ray.errors API) (#9665)

2026-07-05 01:42:33 +08:00 · 2020-08-04 14:04:29 +08:00
parent 8c3fc1db76
commit 28b1f7710c
39 changed files with 312 additions and 602 deletions
@@ -77,7 +77,7 @@ _config = _Config()
 from ray.profiling import profile  # noqa: E402
 from ray.state import (jobs, nodes, actors, objects, timeline,
                       object_transfer_timeline, cluster_resources,
-                       available_resources, errors)  # noqa: E402
+                       available_resources)  # noqa: E402
 from ray.worker import (
    LOCAL_MODE,
    SCRIPT_MODE,
@@ -122,7 +122,6 @@ __all__ = [
    "object_transfer_timeline",
    "cluster_resources",
    "available_resources",
-    "errors",
    "LOCAL_MODE",
    "PYTHON_MODE",
    "SCRIPT_MODE",
@@ -175,8 +175,8 @@ class NodeStats(threading.Thread):
        p.subscribe(log_channel)
        logger.info("NodeStats: subscribed to {}".format(log_channel))

-        error_channel = ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB")
-        p.subscribe(error_channel)
+        error_channel = ray.gcs_utils.RAY_ERROR_PUBSUB_PATTERN
+        p.psubscribe(error_channel)
        logger.info("NodeStats: subscribed to {}".format(error_channel))

        actor_channel = ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN
@@ -211,9 +211,10 @@ class NodeStats(threading.Thread):
                        pid = str(data["pid"])
                        self._logs[ip][pid].extend(data["lines"])
                    elif channel == str(error_channel):
-                        gcs_entry = ray.gcs_utils.GcsEntry.FromString(data)
+                        pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(
+                            data)
                        error_data = ray.gcs_utils.ErrorTableData.FromString(
-                            gcs_entry.entries[0])
+                            pubsub_msg.data)
                        message = error_data.error_message
                        message = re.sub(r"\x1b\[\d+m", "", message)
                        match = re.search(r"\(pid=(\d+), ip=(.*?)\)", message)
@@ -60,13 +60,14 @@ RAY_ACTOR_PUBSUB_PATTERN = "ACTOR:*".encode("ascii")
 # Reporter pub/sub updates
 RAY_REPORTER_PUBSUB_PATTERN = "RAY_REPORTER.*".encode("ascii")

+RAY_ERROR_PUBSUB_PATTERN = "ERROR_INFO:*".encode("ascii")
+
 # These prefixes must be kept up-to-date with the TablePrefix enum in
 # gcs.proto.
 # TODO(rkn): We should use scoped enums, in which case we should be able to
 # just access the flatbuffer generated values.
 TablePrefix_RAYLET_TASK_string = "RAYLET_TASK"
 TablePrefix_OBJECT_string = "OBJECT"
-TablePrefix_ERROR_INFO_string = "ERROR_INFO"
 TablePrefix_PROFILE_string = "PROFILE"
 TablePrefix_JOB_string = "JOB"
 TablePrefix_ACTOR_string = "ACTOR"
@@ -760,67 +760,6 @@ class GlobalState:

        return dict(total_available_resources)

-    def _error_messages(self, job_id):
-        """Get the error messages for a specific driver.
-
-        Args:
-            job_id: The ID of the job to get the errors for.
-
-        Returns:
-            A list of the error messages for this driver.
-        """
-        assert isinstance(job_id, ray.JobID)
-        message = self.redis_client.execute_command(
-            "RAY.TABLE_LOOKUP", gcs_utils.TablePrefix.Value("ERROR_INFO"), "",
-            job_id.binary())
-
-        # If there are no errors, return early.
-        if message is None:
-            return []
-
-        gcs_entries = gcs_utils.GcsEntry.FromString(message)
-        error_messages = []
-        for entry in gcs_entries.entries:
-            error_data = gcs_utils.ErrorTableData.FromString(entry)
-            assert job_id.binary() == error_data.job_id
-            error_message = {
-                "type": error_data.type,
-                "message": error_data.error_message,
-                "timestamp": error_data.timestamp,
-            }
-            error_messages.append(error_message)
-        return error_messages
-
-    def error_messages(self, job_id=None):
-        """Get the error messages for all drivers or a specific driver.
-
-        Args:
-            job_id: The specific job to get the errors for. If this is
-                None, then this method retrieves the errors for all jobs.
-
-        Returns:
-            A list of the error messages for the specified driver if one was
-                given, or a dictionary mapping from job ID to a list of error
-                messages for that driver otherwise.
-        """
-        self._check_connected()
-
-        if job_id is not None:
-            assert isinstance(job_id, ray.JobID)
-            return self._error_messages(job_id)
-
-        error_table_keys = self.redis_client.keys(
-            gcs_utils.TablePrefix_ERROR_INFO_string + "*")
-        job_ids = [
-            key[len(gcs_utils.TablePrefix_ERROR_INFO_string):]
-            for key in error_table_keys
-        ]
-
-        return {
-            binary_to_hex(job_id): self._error_messages(ray.JobID(job_id))
-            for job_id in job_ids
-        }
-
    def actor_checkpoint_info(self, actor_id):
        """Get checkpoint info for the given actor id.
         Args:
@@ -1001,24 +940,3 @@ def available_resources():
            resource in the cluster.
    """
    return state.available_resources()
-
-
-def errors(all_jobs=False):
-    """Get error messages from the cluster.
-
-    Args:
-        all_jobs: False if we should only include error messages for this
-            specific job, or True if we should include error messages for all
-            jobs.
-
-    Returns:
-        Error messages pushed from the cluster. This will be a single list if
-            all_jobs is False, or a dictionary mapping from job ID to a list of
-            error messages for that job if all_jobs is True.
-    """
-    if not all_jobs:
-        worker = ray.worker.global_worker
-        error_messages = state.error_messages(job_id=worker.current_job_id)
-    else:
-        error_messages = state.error_messages(job_id=None)
-    return error_messages
@@ -208,17 +208,6 @@ def run_string_as_driver_nonblocking(driver_script):
    return proc


-def flat_errors():
-    errors = []
-    for job_errors in ray.errors(all_jobs=True).values():
-        errors.extend(job_errors)
-    return errors
-
-
-def relevant_errors(error_type):
-    return [error for error in flat_errors() if error["type"] == error_type]
-
-
 def wait_for_num_actors(num_actors, timeout=10):
    start_time = time.time()
    while time.time() - start_time < timeout:
@@ -228,16 +217,6 @@ def wait_for_num_actors(num_actors, timeout=10):
    raise RayTestTimeoutException("Timed out while waiting for global state.")


-def wait_for_errors(error_type, num_errors, timeout=20):
-    start_time = time.time()
-    while time.time() - start_time < timeout:
-        if len(relevant_errors(error_type)) >= num_errors:
-            return
-        time.sleep(0.1)
-    raise RayTestTimeoutException("Timed out waiting for {} {} errors.".format(
-        num_errors, error_type))
-
-
 def wait_for_condition(condition_predictor, timeout=30, retry_interval_ms=100):
    """Wait until a condition is met or time out with an exception.

@@ -404,3 +383,31 @@ def get_other_nodes(cluster, exclude_head=False):
 def get_non_head_nodes(cluster):
    """Get all non-head nodes."""
    return list(filter(lambda x: x.head is False, cluster.list_all_nodes()))
+
+
+def init_error_pubsub():
+    """Initialize redis error info pub/sub"""
+    p = ray.worker.global_worker.redis_client.pubsub(
+        ignore_subscribe_messages=True)
+    error_pubsub_channel = ray.gcs_utils.RAY_ERROR_PUBSUB_PATTERN
+    p.psubscribe(error_pubsub_channel)
+    return p
+
+
+def get_error_message(pub_sub, num, error_type=None, timeout=10):
+    """Get errors through pub/sub."""
+    start_time = time.time()
+    msgs = []
+    while time.time() - start_time < timeout and len(msgs) < num:
+        msg = pub_sub.get_message()
+        if msg is None:
+            time.sleep(0.01)
+            continue
+        pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"])
+        error_data = ray.gcs_utils.ErrorTableData.FromString(pubsub_msg.data)
+        if error_type is None or error_type == error_data.type:
+            msgs.append(error_data)
+        else:
+            time.sleep(0.01)
+
+    return msgs
@@ -9,6 +9,7 @@ import subprocess

 import ray
 from ray.cluster_utils import Cluster
+from ray.test_utils import init_error_pubsub


@pytest.fixture
@@ -209,3 +210,10 @@ def two_node_cluster():
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
+
+
+@pytest.fixture()
+def error_pubsub():
+    p = init_error_pubsub()
+    yield p
+    p.close()
@@ -12,13 +12,12 @@ import ray.ray_constants as ray_constants
 import ray.test_utils
 import ray.cluster_utils
 from ray.test_utils import (
-    relevant_errors,
    wait_for_condition,
-    wait_for_errors,
    wait_for_pid_to_exit,
    generate_internal_config_map,
    get_other_nodes,
    SignalActor,
+    get_error_message,
 )

 SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM
@@ -627,10 +626,12 @@ def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,


@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_save_exception(ray_start_regular,
+def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
                                      ray_checkpointable_actor_cls):
    """Test actor can still be recovered if checkpoints fail to complete."""

+    p = error_pubsub
+
    @ray.remote(max_restarts=2)
    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
        def save_checkpoint(self, actor_id, checkpoint_context):
@@ -663,14 +664,18 @@ def test_checkpointing_save_exception(ray_start_regular,
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # Check that the checkpoint error was pushed to the driver.
-    wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR


@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_load_exception(ray_start_regular,
+def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
                                      ray_checkpointable_actor_cls):
    """Test actor can still be recovered if checkpoints fail to load."""

+    p = error_pubsub
+
    @ray.remote(max_restarts=2)
    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
        def load_checkpoint(self, actor_id, checkpoints):
@@ -704,7 +709,9 @@ def test_checkpointing_load_exception(ray_start_regular,
    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False

    # Check that the checkpoint error was pushed to the driver.
-    wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR


@pytest.mark.parametrize(
@@ -759,14 +766,16 @@ def test_bad_checkpointable_actor_class():
                return True


-def test_init_exception_in_checkpointable_actor(ray_start_regular,
-                                                ray_checkpointable_actor_cls):
+def test_init_exception_in_checkpointable_actor(
+        ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
    # This test is similar to test_failure.py::test_failed_actor_init.
    # This test is used to guarantee that checkpointable actor does not
    # break the same logic.
    error_message1 = "actor constructor failed"
    error_message2 = "actor method failed"

+    p = error_pubsub
+
    @ray.remote
    class CheckpointableFailedActor(ray_checkpointable_actor_cls):
        def __init__(self):
@@ -781,17 +790,15 @@ def test_init_exception_in_checkpointable_actor(ray_start_regular,
    a = CheckpointableFailedActor.remote()

    # Make sure that we get errors from a failed constructor.
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
-    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
+    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
    assert len(errors) == 1
-    assert error_message1 in errors[0]["message"]
+    assert error_message1 in errors[0].error_message

    # Make sure that we get errors from a failed method.
    a.fail_method.remote()
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
-    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
-    assert len(errors) == 2
-    assert error_message1 in errors[1]["message"]
+    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
+    assert len(errors) == 1
+    assert error_message1 in errors[0].error_message


 def test_decorated_method(ray_start_regular):
@@ -349,31 +349,6 @@ def test_ray_setproctitle(ray_start_2_cpus):
    ray.get(unique_1.remote())


-def test_duplicate_error_messages(shutdown_only):
-    ray.init(num_cpus=0)
-
-    driver_id = ray.WorkerID.nil()
-    error_data = ray.gcs_utils.construct_error_message(driver_id, "test",
-                                                       "message", 0)
-
-    # Push the same message to the GCS twice (they are the same because we
-    # do not include a timestamp).
-
-    r = ray.worker.global_worker.redis_client
-
-    r.execute_command("RAY.TABLE_APPEND",
-                      ray.gcs_utils.TablePrefix.Value("ERROR_INFO"),
-                      ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB"),
-                      driver_id.binary(), error_data)
-
-    # Before https://github.com/ray-project/ray/pull/3316 this would
-    # give an error
-    r.execute_command("RAY.TABLE_APPEND",
-                      ray.gcs_utils.TablePrefix.Value("ERROR_INFO"),
-                      ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB"),
-                      driver_id.binary(), error_data)
-
-
@pytest.mark.skipif(
    os.getenv("TRAVIS") is None,
    reason="This test should only be run on Travis.")
@@ -14,15 +14,14 @@ import ray
 import ray.ray_constants as ray_constants
 from ray.cluster_utils import Cluster
 from ray.test_utils import (
-    relevant_errors,
    wait_for_condition,
-    wait_for_errors,
-    RayTestTimeoutException,
    SignalActor,
+    init_error_pubsub,
+    get_error_message,
 )


-def test_failed_task(ray_start_regular):
+def test_failed_task(ray_start_regular, error_pubsub):
    @ray.remote
    def throw_exception_fct1():
        raise Exception("Test function 1 intentionally failed.")
@@ -35,13 +34,15 @@ def test_failed_task(ray_start_regular):
    def throw_exception_fct3(x):
        raise Exception("Test function 3 intentionally failed.")

+    p = error_pubsub
+
    throw_exception_fct1.remote()
    throw_exception_fct1.remote()
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
-    assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2
-    for task in relevant_errors(ray_constants.TASK_PUSH_ERROR):
-        msg = task.get("message")
-        assert "Test function 1 intentionally failed." in msg
+
+    msgs = get_error_message(p, 2, ray_constants.TASK_PUSH_ERROR)
+    assert len(msgs) == 2
+    for msg in msgs:
+        assert "Test function 1 intentionally failed." in msg.error_message

    x = throw_exception_fct2.remote()
    try:
@@ -120,7 +121,8 @@ def test_get_throws_quickly_when_found_exception(ray_start_regular):
    ray.get(signal2.send.remote())


-def test_fail_importing_remote_function(ray_start_2_cpus):
+def test_fail_importing_remote_function(ray_start_2_cpus, error_pubsub):
+    p = error_pubsub
    # Create the contents of a temporary Python file.
    temporary_python_file = """
 def temporary_helper_function():
@@ -150,11 +152,11 @@ def temporary_helper_function():
    # Invoke the function so that the definition is exported.
    g.remote(1, y=2)

-    wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2)
-    errors = relevant_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR)
-    assert len(errors) >= 2, errors
-    assert "No module named" in errors[0]["message"]
-    assert "No module named" in errors[1]["message"]
+    errors = get_error_message(
+        p, 2, ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR)
+    assert errors[0].type == ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR
+    assert "No module named" in errors[0].error_message
+    assert "No module named" in errors[1].error_message

    # Check that if we try to call the function it throws an exception and
    # does not hang.
@@ -164,26 +166,28 @@ def temporary_helper_function():
            ray.get(g.remote(1, y=2))

    f.close()
-
    # Clean up the junk we added to sys.path.
    sys.path.pop(-1)


-def test_failed_function_to_run(ray_start_2_cpus):
+def test_failed_function_to_run(ray_start_2_cpus, error_pubsub):
+    p = error_pubsub
+
    def f(worker):
        if ray.worker.global_worker.mode == ray.WORKER_MODE:
            raise Exception("Function to run failed.")

    ray.worker.global_worker.run_function_on_all_workers(f)
-    wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2)
    # Check that the error message is in the task info.
-    errors = relevant_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR)
+    errors = get_error_message(p, 2, ray_constants.FUNCTION_TO_RUN_PUSH_ERROR)
    assert len(errors) == 2
-    assert "Function to run failed." in errors[0]["message"]
-    assert "Function to run failed." in errors[1]["message"]
+    assert errors[0].type == ray_constants.FUNCTION_TO_RUN_PUSH_ERROR
+    assert "Function to run failed." in errors[0].error_message
+    assert "Function to run failed." in errors[1].error_message


-def test_fail_importing_actor(ray_start_regular):
+def test_fail_importing_actor(ray_start_regular, error_pubsub):
+    p = error_pubsub
    # Create the contents of a temporary Python file.
    temporary_python_file = """
 def temporary_helper_function():
@@ -210,21 +214,22 @@ def temporary_helper_function():
            return 1

    # There should be no errors yet.
-    assert len(ray.errors()) == 0
-
+    errors = get_error_message(p, 2)
+    assert len(errors) == 0
    # Create an actor.
    foo = Foo.remote(3, arg2=0)

-    # Wait for the error to arrive.
-    wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
-    errors = relevant_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR)
-    assert "No module named" in errors[0]["message"]
+    errors = get_error_message(p, 2)
+    assert len(errors) == 2

-    # Wait for the error from when the __init__ tries to run.
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
-    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
-    assert ("failed to be imported, and so cannot execute this method" in
-            errors[0]["message"])
+    for error in errors:
+        # Wait for the error to arrive.
+        if error.type == ray_constants.REGISTER_ACTOR_PUSH_ERROR:
+            assert "No module named" in error.error_message
+        else:
+            # Wait for the error from when the __init__ tries to run.
+            assert ("failed to be imported, and so cannot execute this method"
+                    in error.error_message)

    # Check that if we try to get the function it throws an exception and
    # does not hang.
@@ -232,10 +237,11 @@ def temporary_helper_function():
        ray.get(foo.get_val.remote(1, arg2=2))

    # Wait for the error from when the call to get_val.
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
-    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
+    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.TASK_PUSH_ERROR
    assert ("failed to be imported, and so cannot execute this method" in
-            errors[1]["message"])
+            errors[0].error_message)

    f.close()

@@ -243,7 +249,8 @@ def temporary_helper_function():
    sys.path.pop(-1)


-def test_failed_actor_init(ray_start_regular):
+def test_failed_actor_init(ray_start_regular, error_pubsub):
+    p = error_pubsub
    error_message1 = "actor constructor failed"
    error_message2 = "actor method failed"

@@ -258,20 +265,21 @@ def test_failed_actor_init(ray_start_regular):
    a = FailedActor.remote()

    # Make sure that we get errors from a failed constructor.
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
-    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
+    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
    assert len(errors) == 1
-    assert error_message1 in errors[0]["message"]
+    assert errors[0].type == ray_constants.TASK_PUSH_ERROR
+    assert error_message1 in errors[0].error_message

    # Make sure that we get errors from a failed method.
    a.fail_method.remote()
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
-    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
-    assert len(errors) == 2
-    assert error_message1 in errors[1]["message"]
+    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.TASK_PUSH_ERROR
+    assert error_message1 in errors[0].error_message


-def test_failed_actor_method(ray_start_regular):
+def test_failed_actor_method(ray_start_regular, error_pubsub):
+    p = error_pubsub
    error_message2 = "actor method failed"

    @ray.remote
@@ -286,10 +294,10 @@ def test_failed_actor_method(ray_start_regular):

    # Make sure that we get errors from a failed method.
    a.fail_method.remote()
-    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
-    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
+    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
    assert len(errors) == 1
-    assert error_message2 in errors[0]["message"]
+    assert errors[0].type == ray_constants.TASK_PUSH_ERROR
+    assert error_message2 in errors[0].error_message


 def test_incorrect_method_calls(ray_start_regular):
@@ -328,7 +336,9 @@ def test_incorrect_method_calls(ray_start_regular):
        a.nonexistent_method.remote()


-def test_worker_raising_exception(ray_start_regular):
+def test_worker_raising_exception(ray_start_regular, error_pubsub):
+    p = error_pubsub
+
    @ray.remote(max_calls=2)
    def f():
        # This is the only reasonable variable we can set here that makes the
@@ -339,12 +349,15 @@ def test_worker_raising_exception(ray_start_regular):
    # Running this task should cause the worker to raise an exception after
    # the task has successfully completed.
    f.remote()
-
-    wait_for_errors(ray_constants.WORKER_CRASH_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.WORKER_CRASH_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.WORKER_CRASH_PUSH_ERROR


-def test_worker_dying(ray_start_regular):
+def test_worker_dying(ray_start_regular, error_pubsub):
+    p = error_pubsub
    # Define a remote function that will kill the worker that runs it.
+
    @ray.remote(max_retries=0)
    def f():
        eval("exit()")
@@ -352,14 +365,15 @@ def test_worker_dying(ray_start_regular):
    with pytest.raises(ray.exceptions.RayWorkerError):
        ray.get(f.remote())

-    wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
-
-    errors = relevant_errors(ray_constants.WORKER_DIED_PUSH_ERROR)
+    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
    assert len(errors) == 1
-    assert "died or was killed while executing" in errors[0]["message"]
+    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR
+    assert "died or was killed while executing" in errors[0].error_message


-def test_actor_worker_dying(ray_start_regular):
+def test_actor_worker_dying(ray_start_regular, error_pubsub):
+    p = error_pubsub
+
    @ray.remote
    class Actor:
        def kill(self):
@@ -375,10 +389,14 @@ def test_actor_worker_dying(ray_start_regular):
        ray.get(obj)
    with pytest.raises(ray.exceptions.RayTaskError):
        ray.get(consume.remote(obj))
-    wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR


-def test_actor_worker_dying_future_tasks(ray_start_regular):
+def test_actor_worker_dying_future_tasks(ray_start_regular, error_pubsub):
+    p = error_pubsub
+
    @ray.remote(max_restarts=0)
    class Actor:
        def getpid(self):
@@ -397,7 +415,9 @@ def test_actor_worker_dying_future_tasks(ray_start_regular):
        with pytest.raises(Exception):
            ray.get(obj)

-    wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR


 def test_actor_worker_dying_nothing_in_progress(ray_start_regular):
@@ -415,7 +435,10 @@ def test_actor_worker_dying_nothing_in_progress(ray_start_regular):
        ray.get(task2)


-def test_actor_scope_or_intentionally_killed_message(ray_start_regular):
+def test_actor_scope_or_intentionally_killed_message(ray_start_regular,
+                                                     error_pubsub):
+    p = error_pubsub
+
    @ray.remote
    class Actor:
        pass
@@ -424,15 +447,16 @@ def test_actor_scope_or_intentionally_killed_message(ray_start_regular):
    a = Actor.remote()
    a.__ray_terminate__.remote()
    time.sleep(1)
-    assert len(
-        ray.errors()) == 0, ("Should not have propogated an error - {}".format(
-            ray.errors()))
+    errors = get_error_message(p, 1)
+    assert len(errors) == 0, "Should not have propogated an error - {}".format(
+        errors)


@pytest.mark.skip("This test does not work yet.")
@pytest.mark.parametrize(
    "ray_start_object_store_memory", [10**6], indirect=True)
-def test_put_error1(ray_start_object_store_memory):
+def test_put_error1(ray_start_object_store_memory, error_pubsub):
+    p = error_pubsub
    num_objects = 3
    object_size = 4 * 10**5

@@ -470,7 +494,10 @@ def test_put_error1(ray_start_object_store_memory):
    put_arg_task.remote()

    # Make sure we receive the correct error message.
-    wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1,
+                               ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR


@pytest.mark.skip("This test does not work yet.")
@@ -514,22 +541,28 @@ def test_put_error2(ray_start_object_store_memory):
    put_task.remote()

    # Make sure we receive the correct error message.
-    wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)
+    # get_error_message(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1)


-def test_version_mismatch(shutdown_only):
+@pytest.mark.skip("Publish happeds before we subscribe it")
+def test_version_mismatch(error_pubsub, shutdown_only):
    ray_version = ray.__version__
    ray.__version__ = "fake ray version"

    ray.init(num_cpus=1)
+    p = error_pubsub

-    wait_for_errors(ray_constants.VERSION_MISMATCH_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.VERSION_MISMATCH_PUSH_ERROR)
+    assert False, errors
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.VERSION_MISMATCH_PUSH_ERROR

    # Reset the version.
    ray.__version__ = ray_version


-def test_export_large_objects(ray_start_regular):
+def test_export_large_objects(ray_start_regular, error_pubsub):
+    p = error_pubsub
    import ray.ray_constants as ray_constants

    large_object = np.zeros(2 * ray_constants.PICKLE_OBJECT_WARNING_SIZE)
@@ -542,7 +575,10 @@ def test_export_large_objects(ray_start_regular):
    f.remote()

    # Make sure that a warning is generated.
-    wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 1)
+    errors = get_error_message(p, 1,
+                               ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR

    @ray.remote
    class Foo:
@@ -552,11 +588,15 @@ def test_export_large_objects(ray_start_regular):
    Foo.remote()

    # Make sure that a warning is generated.
-    wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 2)
+    errors = get_error_message(p, 1,
+                               ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR


@pytest.mark.skip(reason="TODO detect resource deadlock")
-def test_warning_for_resource_deadlock(shutdown_only):
+def test_warning_for_resource_deadlock(error_pubsub, shutdown_only):
+    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.
    ray.init(num_cpus=1)

@@ -574,10 +614,13 @@ def test_warning_for_resource_deadlock(shutdown_only):

    # Run in a task to check we handle the blocked task case correctly
    f.remote()
-    wait_for_errors(ray_constants.RESOURCE_DEADLOCK_ERROR, 1, timeout=30)
+    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


-def test_warning_for_infeasible_tasks(ray_start_regular):
+def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
+    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
@@ -590,11 +633,15 @@ def test_warning_for_infeasible_tasks(ray_start_regular):

    # This task is infeasible.
    f.remote()
-    wait_for_errors(ray_constants.INFEASIBLE_TASK_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    Foo.remote()
-    wait_for_errors(ray_constants.INFEASIBLE_TASK_ERROR, 2)
+    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR


 def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):
@@ -603,6 +650,7 @@ def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):
    # requires no CPUs).

    ray.init(num_cpus=0)
+    p = init_error_pubsub()

    @ray.remote
    class Foo:
@@ -610,7 +658,10 @@ def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):

    # The actor creation should be infeasible.
    Foo.remote()
-    wait_for_errors(ray_constants.INFEASIBLE_TASK_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR
+    p.close()


 def test_warning_for_too_many_actors(shutdown_only):
@@ -619,15 +670,23 @@ def test_warning_for_too_many_actors(shutdown_only):
    num_cpus = 2
    ray.init(num_cpus=num_cpus)

+    p = init_error_pubsub()
+
    @ray.remote
    class Foo:
        def __init__(self):
            time.sleep(1000)

    [Foo.remote() for _ in range(num_cpus * 3)]
-    wait_for_errors(ray_constants.WORKER_POOL_LARGE_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
+
    [Foo.remote() for _ in range(num_cpus)]
-    wait_for_errors(ray_constants.WORKER_POOL_LARGE_ERROR, 2)
+    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
+    p.close()


 def test_warning_for_too_many_nested_tasks(shutdown_only):
@@ -635,6 +694,7 @@ def test_warning_for_too_many_nested_tasks(shutdown_only):
    # started that we will receive a warning.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
+    p = init_error_pubsub()

    @ray.remote
    def f():
@@ -654,7 +714,10 @@ def test_warning_for_too_many_nested_tasks(shutdown_only):
        ray.get(h.remote())

    [g.remote() for _ in range(num_cpus * 4)]
-    wait_for_errors(ray_constants.WORKER_POOL_LARGE_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
+    assert len(errors) == 1
+    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
+    p.close()


 def test_warning_for_many_duplicate_remote_functions_and_actors(shutdown_only):
@@ -786,9 +849,10 @@ def test_redis_module_failure(ray_start_regular):

 # Note that this test will take at least 10 seconds because it must wait for
 # the monitor to detect enough missed heartbeats.
-def test_warning_for_dead_node(ray_start_cluster_2_nodes):
+def test_warning_for_dead_node(ray_start_cluster_2_nodes, error_pubsub):
    cluster = ray_start_cluster_2_nodes
    cluster.wait_for_nodes()
+    p = error_pubsub

    node_ids = {item["NodeID"] for item in ray.nodes()}

@@ -801,14 +865,11 @@ def test_warning_for_dead_node(ray_start_cluster_2_nodes):
    cluster.list_all_nodes()[0].kill_raylet()

    # Check that we get warning messages for both raylets.
-    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2, timeout=40)
+    errors = get_error_message(p, 2, ray_constants.REMOVED_NODE_ERROR, 40)

    # Extract the client IDs from the error messages. This will need to be
    # changed if the error message changes.
-    warning_node_ids = {
-        item["message"].split(" ")[5]
-        for item in relevant_errors(ray_constants.REMOVED_NODE_ERROR)
-    }
+    warning_node_ids = {error.error_message.split(" ")[5] for error in errors}

    assert node_ids == warning_node_ids

@@ -837,24 +898,28 @@ def test_connect_with_disconnected_node(shutdown_only):
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
-    info = relevant_errors(ray_constants.REMOVED_NODE_ERROR)
-    assert len(info) == 0
+    p = init_error_pubsub()
+    errors = get_error_message(p, 1, timeout=5)
+    assert len(errors) == 0
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
-    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 1)
+    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
+    assert len(errors) == 1
    # This node is killed by SIGKILL, ray_monitor will mark it to dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
-    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2)
+    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
+    assert len(errors) == 1
    # This node is killed by SIGTERM, ray_monitor will not mark it again.
    removing_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(removing_node, allow_graceful=True)
-    with pytest.raises(RayTestTimeoutException):
-        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)
+    errors = get_error_message(p, 1, timeout=2)
+    assert len(errors) == 0
    # There is no connection error to a dead node.
-    info = relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)
-    assert len(info) == 0
+    errors = get_error_message(p, 1, timeout=2)
+    assert len(errors) == 0
+    p.close()


@pytest.mark.parametrize(
@@ -8,16 +8,19 @@ import ray
 from ray.test_utils import (
    RayTestTimeoutException, check_call_ray, run_string_as_driver,
    run_string_as_driver_nonblocking, wait_for_children_of_pid,
-    wait_for_children_of_pid_to_exit, kill_process_by_name, Semaphore)
+    wait_for_children_of_pid_to_exit, kill_process_by_name, Semaphore,
+    init_error_pubsub, get_error_message)


 def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)
+    p = init_error_pubsub()

    # There shouldn't be any errors yet.
-    assert len(ray.errors()) == 0
+    errors = get_error_message(p, 1, 2)
+    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"
@@ -31,13 +34,11 @@ def test_error_isolation(call_ray_start):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
-    while len(ray.errors()) != 1:
-        time.sleep(0.1)
-        print("Waiting for error to appear.")
+    errors = get_error_message(p, 1)

    # Make sure we got the error.
-    assert len(ray.errors()) == 1
-    assert error_string1 in ray.errors()[0]["message"]
+    assert len(errors) == 1
+    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
@@ -45,11 +46,13 @@ def test_error_isolation(call_ray_start):
    driver_script = """
 import ray
 import time
+from ray.test_utils import (init_error_pubsub, get_error_message)

 ray.init(address="{}")
-
+p = init_error_pubsub()
 time.sleep(1)
-assert len(ray.errors()) == 0
+errors = get_error_message(p, 1, 2)
+assert len(errors) == 0

@ray.remote
 def f():
@@ -60,12 +63,10 @@ try:
 except Exception as e:
    pass

-while len(ray.errors()) != 1:
-    print(len(ray.errors()))
-    time.sleep(0.1)
-assert len(ray.errors()) == 1
+errors = get_error_message(p, 1)
+assert len(errors) == 1

-assert "{}" in ray.errors()[0]["message"]
+assert "{}" in errors[0].error_message

 print("success")
 """.format(address, error_string2, error_string2)
@@ -76,8 +77,9 @@ print("success")

    # Make sure that the other error message doesn't show up for this
    # driver.
-    assert len(ray.errors()) == 1
-    assert error_string1 in ray.errors()[0]["message"]
+    errors = get_error_message(p, 1)
+    assert len(errors) == 1
+    p.close()


 def test_remote_function_isolation(call_ray_start):
@@ -7,8 +7,8 @@ import time

 import ray
 from ray.cluster_utils import Cluster
-from ray.test_utils import flat_errors
 import ray.ray_constants as ray_constants
+from ray.test_utils import get_error_message


@pytest.fixture(params=[1, 4])
@@ -203,12 +203,12 @@ def test_multiple_recursive(ray_start_reconstruction):
    assert cluster.remaining_processes_alive()


-def wait_for_errors(error_check):
+def wait_for_errors(p, error_check):
    # Wait for errors from all the nondeterministic tasks.
    errors = []
    time_left = 100
    while time_left > 0:
-        errors = flat_errors()
+        errors.extend(get_error_message(p, 1))
        if error_check(errors):
            break
        time_left -= 1
@@ -223,7 +223,8 @@ def wait_for_errors(error_check):
@pytest.mark.skipif(
    os.environ.get("RAY_USE_NEW_GCS") == "on",
    reason="Failing with new GCS API on Linux.")
-def test_nondeterministic_task(ray_start_reconstruction):
+def test_nondeterministic_task(ray_start_reconstruction, error_pubsub):
+    p = error_pubsub
    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
    # Define the size of one task's return argument so that the combined
    # sum of all objects' sizes is at least twice the plasma stores'
@@ -280,9 +281,9 @@ def test_nondeterministic_task(ray_start_reconstruction):
            min_errors = 1
        return len(errors) >= min_errors

-    errors = wait_for_errors(error_check)
+    errors = wait_for_errors(p, error_check)
    # Make sure all the errors have the correct type.
-    assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
+    assert all(error.type == ray_constants.HASH_MISMATCH_PUSH_ERROR
               for error in errors)

    assert cluster.remaining_processes_alive()
@@ -293,7 +294,8 @@ def test_nondeterministic_task(ray_start_reconstruction):
    reason="Failing with new GCS API on Linux.")
@pytest.mark.parametrize(
    "ray_start_object_store_memory", [10**9], indirect=True)
-def test_driver_put_errors(ray_start_object_store_memory):
+def test_driver_put_errors(ray_start_object_store_memory, error_pubsub):
+    p = error_pubsub
    plasma_store_memory = ray_start_object_store_memory
    # Define the size of one task's return argument so that the combined
    # sum of all objects' sizes is at least twice the plasma stores'
@@ -333,10 +335,11 @@ def test_driver_put_errors(ray_start_object_store_memory):
    def error_check(errors):
        return len(errors) > 1

-    errors = wait_for_errors(error_check)
-    assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
-               or "ray.exceptions.UnreconstructableError" in error["message"]
-               for error in errors)
+    errors = wait_for_errors(p, error_check)
+    assert all(
+        error.type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
+        or "ray.exceptions.UnreconstructableError" in error.error_messages
+        for error in errors)


 # NOTE(swang): This test tries to launch 1000 workers and breaks.
@@ -117,10 +117,11 @@ def push_error_to_driver_through_redis(redis_client,
    # of through the raylet.
    error_data = ray.gcs_utils.construct_error_message(job_id, error_type,
                                                       message, time.time())
-    redis_client.execute_command(
-        "RAY.TABLE_APPEND", ray.gcs_utils.TablePrefix.Value("ERROR_INFO"),
-        ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB"), job_id.binary(),
-        error_data)
+    pubsub_msg = ray.gcs_utils.PubSubMessage()
+    pubsub_msg.id = job_id.hex()
+    pubsub_msg.data = error_data
+    redis_client.publish("ERROR_INFO:" + job_id.hex(),
+                         pubsub_msg.SerializeAsString())


 def is_cython(obj):
@@ -1095,16 +1095,11 @@ def listen_error_messages_raylet(worker, task_error_queue, threads_stopped):

    # Really we should just subscribe to the errors for this specific job.
    # However, currently all errors seem to be published on the same channel.
-    error_pubsub_channel = str(
-        ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB")).encode("ascii")
-    worker.error_message_pubsub_client.subscribe(error_pubsub_channel)
-    # worker.error_message_pubsub_client.psubscribe("*")
+    error_pubsub_channel = ray.gcs_utils.RAY_ERROR_PUBSUB_PATTERN
+    worker.error_message_pubsub_client.psubscribe(error_pubsub_channel)

    try:
        # Get the errors that occurred before the call to subscribe.
-        error_messages = ray.errors()
-        for error_message in error_messages:
-            logger.error(error_message)

        while True:
            # Exit if we received a signal that we should stop.
@@ -1115,10 +1110,9 @@ def listen_error_messages_raylet(worker, task_error_queue, threads_stopped):
            if msg is None:
                threads_stopped.wait(timeout=0.01)
                continue
-            gcs_entry = ray.gcs_utils.GcsEntry.FromString(msg["data"])
-            assert len(gcs_entry.entries) == 1
+            pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"])
            error_data = ray.gcs_utils.ErrorTableData.FromString(
-                gcs_entry.entries[0])
+                pubsub_msg.data)
            job_id = error_data.job_id
            if job_id not in [
                    worker.current_job_id.binary(),