[api] API deprecations and cleanups for 1.0 (internal_config and Checkpointable actor) (#10333)

* remove * internal config updates, remove Checkpointable * Lower object timeout default * remove json * Fix flaky test * Fix unit test
2026-06-28 00:44:50 +08:00 · 2020-08-27 10:19:53 -07:00
parent 0aec4cbccb
commit f75dfd60a3
56 changed files with 239 additions and 1267 deletions
@@ -2,8 +2,6 @@ import inspect
 import logging
 import weakref

-from abc import ABCMeta, abstractmethod
-from collections import namedtuple
 import ray.ray_constants as ray_constants
 import ray._raylet
 import ray.signature as signature
@@ -854,11 +852,6 @@ def modify_class(cls):
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

-    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
-        raise TypeError(
-            "A checkpointable actor class should implement all abstract "
-            "methods in the `Checkpointable` interface.")
-
    # Modify the class to have an additional method that will be used for
    # terminating the worker.
    class Class(cls):
@@ -869,20 +862,6 @@ def modify_class(cls):
            if worker.mode != ray.LOCAL_MODE:
                ray.actor.exit_actor()

-        def __ray_checkpoint__(self):
-            """Save a checkpoint.
-
-            This task saves the current state of the actor, the current task
-            frontier according to the raylet, and the checkpoint index
-            (number of tasks executed so far).
-            """
-            worker = ray.worker.global_worker
-            if not isinstance(self, ray.actor.Checkpointable):
-                raise TypeError(
-                    "__ray_checkpoint__.remote() may only be called on actors "
-                    "that implement ray.actor.Checkpointable")
-            return worker._save_actor_checkpoint()
-
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

@@ -951,128 +930,3 @@ def exit_actor():
        assert False, "This process should have terminated."
    else:
        raise TypeError("exit_actor called on a non-actor worker.")
-
-
-CheckpointContext = namedtuple(
-    "CheckpointContext",
-    [
-        # Actor's ID.
-        "actor_id",
-        # Number of tasks executed since last checkpoint.
-        "num_tasks_since_last_checkpoint",
-        # Time elapsed since last checkpoint, in milliseconds.
-        "time_elapsed_ms_since_last_checkpoint",
-    ],
-)
-"""A namedtuple that contains information about actor's last checkpoint."""
-
-Checkpoint = namedtuple(
-    "Checkpoint",
-    [
-        # ID of this checkpoint.
-        "checkpoint_id",
-        # The timestamp at which this checkpoint was saved,
-        # represented as milliseconds elapsed since Unix epoch.
-        "timestamp",
-    ],
-)
-"""A namedtuple that represents a checkpoint."""
-
-
-class Checkpointable(metaclass=ABCMeta):
-    """An interface that indicates an actor can be checkpointed."""
-
-    @abstractmethod
-    def should_checkpoint(self, checkpoint_context):
-        """Whether this actor needs to be checkpointed.
-
-        This method will be called after every task. You should implement this
-        callback to decide whether this actor needs to be checkpointed at this
-        time, based on the checkpoint context, or any other factors.
-
-        Args:
-            checkpoint_context: A namedtuple that contains info about last
-                checkpoint.
-
-        Returns:
-            A boolean value that indicates whether this actor needs to be
-            checkpointed.
-        """
-        pass
-
-    @abstractmethod
-    def save_checkpoint(self, actor_id, checkpoint_id):
-        """Save a checkpoint to persistent storage.
-
-        If `should_checkpoint` returns true, this method will be called. You
-        should implement this callback to save actor's checkpoint and the given
-        checkpoint id to persistent storage.
-
-        Args:
-            actor_id: Actor's ID.
-            checkpoint_id: ID of this checkpoint. You should save it together
-                with actor's checkpoint data. And it will be used by the
-                `load_checkpoint` method.
-        Returns:
-            None.
-        """
-        pass
-
-    @abstractmethod
-    def load_checkpoint(self, actor_id, available_checkpoints):
-        """Load actor's previous checkpoint, and restore actor's state.
-
-        This method will be called when an actor is restarted, after
-        actor's constructor.
-        If the actor needs to restore from previous checkpoint, this function
-        should restore actor's state and return the checkpoint ID. Otherwise,
-        it should do nothing and return None.
-        Note, this method must return one of the checkpoint IDs in the
-        `available_checkpoints` list, or None. Otherwise, an exception will be
-        raised.
-
-        Args:
-            actor_id: Actor's ID.
-            available_checkpoints: A list of `Checkpoint` namedtuples that
-                contains all available checkpoint IDs and their timestamps,
-                sorted by timestamp in descending order.
-        Returns:
-            The ID of the checkpoint from which the actor was resumed, or None
-            if the actor should restart from the beginning.
-        """
-        pass
-
-    @abstractmethod
-    def checkpoint_expired(self, actor_id, checkpoint_id):
-        """Delete an expired checkpoint.
-
-        This method will be called when an checkpoint is expired. You should
-        implement this method to delete your application checkpoint data.
-        Note, the maximum number of checkpoints kept in the backend can be
-        configured at `RayConfig.num_actor_checkpoints_to_keep`.
-
-        Args:
-            actor_id: ID of the actor.
-            checkpoint_id: ID of the checkpoint that has expired.
-        Returns:
-            None.
-        """
-        pass
-
-
-def get_checkpoints_for_actor(actor_id):
-    """Get the available checkpoints for the given actor ID, return a list
-    sorted by checkpoint timestamp in descending order.
-    """
-    checkpoint_info = ray.state.state.actor_checkpoint_info(actor_id)
-    if checkpoint_info is None:
-        return []
-    checkpoints = [
-        Checkpoint(checkpoint_id, timestamp) for checkpoint_id, timestamp in
-        zip(checkpoint_info["CheckpointIds"], checkpoint_info["Timestamps"])
-    ]
-    return sorted(
-        checkpoints,
-        key=lambda checkpoint: checkpoint.timestamp,
-        reverse=True,
-    )
@@ -1,4 +1,3 @@
-import json
 import logging
 import time

@@ -80,9 +79,6 @@ class Cluster:
            "min_worker_port": 0,
            "max_worker_port": 0,
        }
-        if "_internal_config" in node_args:
-            node_args["_internal_config"] = json.loads(
-                node_args["_internal_config"])
        ray_params = ray.parameter.RayParams(**node_args)
        ray_params.update_if_absent(**default_kwargs)
        if self.head_node is None:
@@ -544,43 +544,13 @@ class FunctionActorManager:
        """

        def actor_method_executor(actor, *args, **kwargs):
-            # Update the actor's task counter to reflect the task we're about
-            # to execute.
-            self._worker.actor_task_counter += 1
-
-            # Execute the assigned method and save a checkpoint if necessary.
-            try:
-                is_bound = (is_class_method(method)
-                            or is_static_method(type(actor), method_name))
-                if is_bound:
-                    method_returns = method(*args, **kwargs)
-                else:
-                    method_returns = method(actor, *args, **kwargs)
-            except Exception as e:
-                # Save the checkpoint before allowing the method exception
-                # to be thrown, but don't save the checkpoint for actor
-                # creation task.
-                if (isinstance(actor, ray.actor.Checkpointable)
-                        and self._worker.actor_task_counter != 1):
-                    self._save_and_log_checkpoint(actor)
-                raise e
+            # Execute the assigned method.
+            is_bound = (is_class_method(method)
+                        or is_static_method(type(actor), method_name))
+            if is_bound:
+                return method(*args, **kwargs)
            else:
-                # Handle any checkpointing operations before storing the
-                # method's return values.
-                # NOTE(swang): If method_returns is a pointer to the actor's
-                # state and the checkpointing operations can modify the return
-                # values if they mutate the actor's state. Is this okay?
-                if isinstance(actor, ray.actor.Checkpointable):
-                    # If this is the first task to execute on the actor, try to
-                    # resume from a checkpoint.
-                    if self._worker.actor_task_counter == 1:
-                        if actor_imported:
-                            self._restore_and_log_checkpoint(actor)
-                    else:
-                        # Save the checkpoint before returning the method's
-                        # return values.
-                        self._save_and_log_checkpoint(actor)
-                return method_returns
+                return method(actor, *args, **kwargs)

        # Set method_name and method as attributes to the executor closure
        # so we can make decision based on these attributes in task executor.
@@ -591,86 +561,3 @@ class FunctionActorManager:
        actor_method_executor.method = method

        return actor_method_executor
-
-    def _save_and_log_checkpoint(self, actor):
-        """Save an actor checkpoint if necessary and log any errors.
-
-        Args:
-            actor: The actor to checkpoint.
-
-        Returns:
-            The result of the actor's user-defined `save_checkpoint` method.
-        """
-        actor_id = self._worker.actor_id
-        checkpoint_info = self._worker.actor_checkpoint_info[actor_id]
-        checkpoint_info.num_tasks_since_last_checkpoint += 1
-        now = int(1000 * time.time())
-        checkpoint_context = ray.actor.CheckpointContext(
-            actor_id, checkpoint_info.num_tasks_since_last_checkpoint,
-            now - checkpoint_info.last_checkpoint_timestamp)
-        # If we should take a checkpoint, notify raylet to prepare a checkpoint
-        # and then call `save_checkpoint`.
-        if actor.should_checkpoint(checkpoint_context):
-            try:
-                now = int(1000 * time.time())
-                checkpoint_id = (
-                    self._worker.core_worker.prepare_actor_checkpoint(actor_id)
-                )
-                checkpoint_info.checkpoint_ids.append(checkpoint_id)
-                actor.save_checkpoint(actor_id, checkpoint_id)
-                if (len(checkpoint_info.checkpoint_ids) >
-                        ray._config.num_actor_checkpoints_to_keep()):
-                    actor.checkpoint_expired(
-                        actor_id,
-                        checkpoint_info.checkpoint_ids.pop(0),
-                    )
-                checkpoint_info.num_tasks_since_last_checkpoint = 0
-                checkpoint_info.last_checkpoint_timestamp = now
-            except Exception:
-                # Checkpoint save or reload failed. Notify the driver.
-                traceback_str = ray.utils.format_error_message(
-                    traceback.format_exc())
-                ray.utils.push_error_to_driver(
-                    self._worker,
-                    ray_constants.CHECKPOINT_PUSH_ERROR,
-                    traceback_str,
-                    job_id=self._worker.current_job_id)
-
-    def _restore_and_log_checkpoint(self, actor):
-        """Restore an actor from a checkpoint if available and log any errors.
-
-        This should only be called on workers that have just executed an actor
-        creation task.
-
-        Args:
-            actor: The actor to restore from a checkpoint.
-        """
-        actor_id = self._worker.actor_id
-        try:
-            checkpoints = ray.actor.get_checkpoints_for_actor(actor_id)
-            if len(checkpoints) > 0:
-                # If we found previously saved checkpoints for this actor,
-                # call the `load_checkpoint` callback.
-                checkpoint_id = actor.load_checkpoint(actor_id, checkpoints)
-                if checkpoint_id is not None:
-                    # Check that the returned checkpoint id is in the
-                    # `available_checkpoints` list.
-                    msg = (
-                        "`load_checkpoint` must return a checkpoint id that " +
-                        "exists in the `available_checkpoints` list, or None.")
-                    assert any(checkpoint_id == checkpoint.checkpoint_id
-                               for checkpoint in checkpoints), msg
-                    # Notify raylet that this actor has been resumed from
-                    # a checkpoint.
-                    (self._worker.core_worker.
-                     notify_actor_resumed_from_checkpoint(
-                         actor_id, checkpoint_id))
-        except Exception:
-            # Checkpoint save or reload failed. Notify the driver.
-            traceback_str = ray.utils.format_error_message(
-                traceback.format_exc())
-            ray.utils.push_error_to_driver(
-                self._worker,
-                ray_constants.CHECKPOINT_PUSH_ERROR,
-                traceback_str,
-                job_id=self._worker.current_job_id)
@@ -21,52 +21,28 @@ cdef extern from "ray/common/ray_config.h" nogil:

        uint64_t num_heartbeats_warning() const

-        int64_t initial_reconstruction_timeout_milliseconds() const
+        int64_t object_timeout_milliseconds() const

        int64_t get_timeout_milliseconds() const

-        uint64_t max_lineage_size() const
-
        int64_t worker_get_request_size() const

        int64_t worker_fetch_request_size() const

-        int64_t actor_max_dummy_objects() const
-
        int64_t raylet_client_num_connect_attempts() const

        int64_t raylet_client_connect_timeout_milliseconds() const

        int64_t raylet_fetch_timeout_milliseconds() const

-        int64_t raylet_reconstruction_timeout_milliseconds() const
-
-        int64_t max_num_to_reconstruct() const
-
-        int64_t raylet_fetch_request_size() const
-
        int64_t kill_worker_timeout_milliseconds() const

        int64_t worker_register_timeout_seconds() const

-        int64_t max_time_for_handler_milliseconds() const
-
-        int64_t max_time_for_loop() const
-
        int64_t redis_db_connect_retries()

        int64_t redis_db_connect_wait_milliseconds() const

-        int64_t plasma_default_release_delay() const
-
-        int64_t L3_cache_size_bytes() const
-
-        int64_t max_tasks_to_spillback() const
-
-        int64_t actor_creation_num_spillbacks_warning() const
-
-        int node_manager_forward_task_retry_timeout_milliseconds() const
-
        int object_manager_pull_timeout_ms() const

        int object_manager_push_timeout_ms() const
@@ -79,10 +55,6 @@ cdef extern from "ray/common/ray_config.h" nogil:

        int num_workers_per_process_java() const

-        int64_t max_task_lease_timeout_ms() const
-
-        uint32_t num_actor_checkpoints_to_keep() const
-
        uint32_t maximum_gcs_deletion_batch_size() const

        int64_t max_direct_call_object_size() const
@@ -26,18 +26,14 @@ cdef class Config:
        return RayConfig.instance().num_heartbeats_warning()

    @staticmethod
-    def initial_reconstruction_timeout_milliseconds():
+    def object_timeout_milliseconds():
        return (RayConfig.instance()
-                .initial_reconstruction_timeout_milliseconds())
+                .object_timeout_milliseconds())

    @staticmethod
    def get_timeout_milliseconds():
        return RayConfig.instance().get_timeout_milliseconds()

-    @staticmethod
-    def max_lineage_size():
-        return RayConfig.instance().max_lineage_size()
-
    @staticmethod
    def worker_get_request_size():
        return RayConfig.instance().worker_get_request_size()
@@ -46,10 +42,6 @@ cdef class Config:
    def worker_fetch_request_size():
        return RayConfig.instance().worker_fetch_request_size()

-    @staticmethod
-    def actor_max_dummy_objects():
-        return RayConfig.instance().actor_max_dummy_objects()
-
    @staticmethod
    def raylet_client_num_connect_attempts():
        return RayConfig.instance().raylet_client_num_connect_attempts()
@@ -64,19 +56,6 @@ cdef class Config:
        return (RayConfig.instance()
                .raylet_fetch_timeout_milliseconds())

-    @staticmethod
-    def raylet_reconstruction_timeout_milliseconds():
-        return (RayConfig.instance()
-                .raylet_reconstruction_timeout_milliseconds())
-
-    @staticmethod
-    def max_num_to_reconstruct():
-        return RayConfig.instance().max_num_to_reconstruct()
-
-    @staticmethod
-    def raylet_fetch_request_size():
-        return RayConfig.instance().raylet_fetch_request_size()
-
    @staticmethod
    def kill_worker_timeout_milliseconds():
        return RayConfig.instance().kill_worker_timeout_milliseconds()
@@ -85,14 +64,6 @@ cdef class Config:
    def worker_register_timeout_seconds():
        return RayConfig.instance().worker_register_timeout_seconds()

-    @staticmethod
-    def max_time_for_handler_milliseconds():
-        return RayConfig.instance().max_time_for_handler_milliseconds()
-
-    @staticmethod
-    def max_time_for_loop():
-        return RayConfig.instance().max_time_for_loop()
-
    @staticmethod
    def redis_db_connect_retries():
        return RayConfig.instance().redis_db_connect_retries()
@@ -101,27 +72,6 @@ cdef class Config:
    def redis_db_connect_wait_milliseconds():
        return RayConfig.instance().redis_db_connect_wait_milliseconds()

-    @staticmethod
-    def plasma_default_release_delay():
-        return RayConfig.instance().plasma_default_release_delay()
-
-    @staticmethod
-    def L3_cache_size_bytes():
-        return RayConfig.instance().L3_cache_size_bytes()
-
-    @staticmethod
-    def max_tasks_to_spillback():
-        return RayConfig.instance().max_tasks_to_spillback()
-
-    @staticmethod
-    def actor_creation_num_spillbacks_warning():
-        return RayConfig.instance().actor_creation_num_spillbacks_warning()
-
-    @staticmethod
-    def node_manager_forward_task_retry_timeout_milliseconds():
-        return (RayConfig.instance()
-                .node_manager_forward_task_retry_timeout_milliseconds())
-
    @staticmethod
    def object_manager_pull_timeout_ms():
        return RayConfig.instance().object_manager_pull_timeout_ms()
@@ -146,14 +96,6 @@ cdef class Config:
    def num_workers_per_process_java():
        return RayConfig.instance().num_workers_per_process_java()

-    @staticmethod
-    def max_task_lease_timeout_ms():
-        return RayConfig.instance().max_task_lease_timeout_ms()
-
-    @staticmethod
-    def num_actor_checkpoints_to_keep():
-        return RayConfig.instance().num_actor_checkpoints_to_keep()
-
    @staticmethod
    def maximum_gcs_deletion_batch_size():
        return RayConfig.instance().maximum_gcs_deletion_batch_size()
@@ -93,9 +93,9 @@ class Node:
                "The raylet IP address should only be different than the node "
                "IP address when connecting to an existing raylet; i.e., when "
                "head=False and connect_only=True.")
-        if ray_params._internal_config and len(
-                ray_params._internal_config) > 0 and (not head
-                                                      and not connect_only):
+        if ray_params._system_config and len(
+                ray_params._system_config) > 0 and (not head
+                                                    and not connect_only):
            raise ValueError(
                "Internal config parameters can only be set on the head node.")

@@ -124,7 +124,7 @@ class Node:
        self._localhost = socket.gethostbyname("localhost")
        self._ray_params = ray_params
        self._redis_address = ray_params.redis_address
-        self._config = ray_params._internal_config or {}
+        self._config = ray_params._system_config or {}

        # Enable Plasma Store as a thread by default.
        if "plasma_store_as_thread" not in self._config:
@@ -91,8 +91,9 @@ class RayParams:
        metrics_agent_port(int): The port to bind metrics agent.
        metrics_export_port(int): The port at which metrics are exposed
            through a Prometheus endpoint.
-        _internal_config (str): JSON configuration for overriding
-            RayConfig defaults. For testing purposes ONLY.
+        _system_config (dict): Configuration for overriding RayConfig
+            defaults. Used to set system configuration and for experimental Ray
+            core feature flags.
        lru_evict (bool): Enable LRU eviction if space is needed.
        enable_object_reconstruction (bool): Enable plasma reconstruction on
            failure.
@@ -141,7 +142,7 @@ class RayParams:
                 java_worker_options=None,
                 load_code_from_local=False,
                 start_initial_python_workers_for_first_job=False,
-                 _internal_config=None,
+                 _system_config=None,
                 enable_object_reconstruction=False,
                 metrics_agent_port=None,
                 metrics_export_port=None,
@@ -188,7 +189,7 @@ class RayParams:
        self.metrics_export_port = metrics_export_port
        self.start_initial_python_workers_for_first_job = (
            start_initial_python_workers_for_first_job)
-        self._internal_config = _internal_config
+        self._system_config = _system_config
        self._lru_evict = lru_evict
        self._enable_object_reconstruction = enable_object_reconstruction
        self.object_spilling_config = object_spilling_config
@@ -197,26 +198,26 @@ class RayParams:
        # Set the internal config options for LRU eviction.
        if lru_evict:
            # Turn off object pinning.
-            if self._internal_config is None:
-                self._internal_config = dict()
-            if self._internal_config.get("object_pinning_enabled", False):
+            if self._system_config is None:
+                self._system_config = dict()
+            if self._system_config.get("object_pinning_enabled", False):
                raise Exception(
                    "Object pinning cannot be enabled if using LRU eviction.")
-            self._internal_config["object_pinning_enabled"] = False
-            self._internal_config["object_store_full_max_retries"] = -1
-            self._internal_config["free_objects_period_milliseconds"] = 1000
+            self._system_config["object_pinning_enabled"] = False
+            self._system_config["object_store_full_max_retries"] = -1
+            self._system_config["free_objects_period_milliseconds"] = 1000

        # Set the internal config options for object reconstruction.
        if enable_object_reconstruction:
            # Turn off object pinning.
-            if self._internal_config is None:
-                self._internal_config = dict()
+            if self._system_config is None:
+                self._system_config = dict()
            if lru_evict:
                raise Exception(
                    "Object reconstruction cannot be enabled if using LRU "
                    "eviction.")
-            self._internal_config["lineage_pinning_enabled"] = True
-            self._internal_config["free_objects_period_milliseconds"] = -1
+            self._system_config["lineage_pinning_enabled"] = True
+            self._system_config["free_objects_period_milliseconds"] = -1

    def update(self, **kwargs):
        """Update the settings according to the keyword arguments.
@@ -1,7 +1,6 @@
 """This is the script for `ray microbenchmark`."""

 import asyncio
-import json
 import logging
 import os
 import time
@@ -110,10 +109,7 @@ def main():

    print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")

-    ray.init(
-        _internal_config=json.dumps({
-            "put_small_object_in_memory_store": True
-        }))
+    ray.init(_system_config={"put_small_object_in_memory_store": True})

    value = ray.put(0)

@@ -138,10 +134,7 @@ def main():
    timeit("multi client put calls", put_multi_small, 1000)

    ray.shutdown()
-    ray.init(
-        _internal_config=json.dumps({
-            "put_small_object_in_memory_store": False
-        }))
+    ray.init(_system_config={"put_small_object_in_memory_store": False})

    value = ray.put(0)
    arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)
@@ -358,10 +358,10 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
    type=str,
    help="Overwrite the options to start Java workers.")
@click.option(
-    "--internal-config",
+    "--system-config",
    default=None,
    type=json.loads,
-    help="Do NOT use this. This is for debugging/development purposes ONLY.")
+    help="Override system configuration defaults.")
@click.option(
    "--load-code-from-local",
    is_flag=True,
@@ -394,9 +394,9 @@ def start(node_ip_address, redis_address, address, redis_port, port,
          dashboard_port, block, plasma_directory, huge_pages,
          autoscaling_config, no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
-          java_worker_options, load_code_from_local, internal_config,
-          lru_evict, enable_object_reconstruction, metrics_export_port,
-          log_new_style, log_color, verbose):
+          java_worker_options, load_code_from_local, system_config, lru_evict,
+          enable_object_reconstruction, metrics_export_port, log_new_style,
+          log_color, verbose):
    """Start Ray processes manually on the local machine."""
    cli_logger.old_style = not log_new_style
    cli_logger.color_mode = log_color
@@ -508,7 +508,7 @@ def start(node_ip_address, redis_address, address, redis_port, port,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
-        _internal_config=internal_config,
+        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
@@ -1,7 +1,6 @@
 import asyncio
 import errno
 import io
-import json
 import fnmatch
 import os
 import subprocess
@@ -282,10 +281,9 @@ def recursive_fnmatch(dirpath, pattern):
    return matches


-def generate_internal_config_map(**kwargs):
-    internal_config = json.dumps(kwargs)
+def generate_system_config_map(**kwargs):
    ray_kwargs = {
-        "_internal_config": internal_config,
+        "_system_config": kwargs,
    }
    return ray_kwargs

@@ -3,7 +3,6 @@ This file defines the common pytest fixtures used in current directory.
 """

 from contextlib import contextmanager
-import json
 import pytest
 import subprocess

@@ -19,22 +18,22 @@ def shutdown_only():
    ray.shutdown()


-def get_default_fixure_internal_config():
-    internal_config = json.dumps({
-        "initial_reconstruction_timeout_milliseconds": 200,
+def get_default_fixure_system_config():
+    system_config = {
+        "object_timeout_milliseconds": 200,
        "num_heartbeats_timeout": 10,
        "object_store_full_max_retries": 3,
        "object_store_full_initial_delay_ms": 100,
-    })
-    return internal_config
+    }
+    return system_config


 def get_default_fixture_ray_kwargs():
-    internal_config = get_default_fixure_internal_config()
+    system_config = get_default_fixure_system_config()
    ray_kwargs = {
        "num_cpus": 1,
        "object_store_memory": 150 * 1024 * 1024,
-        "_internal_config": internal_config,
+        "_system_config": system_config,
    }
    return ray_kwargs

@@ -125,8 +124,8 @@ def _ray_start_cluster(**kwargs):
    cluster = Cluster()
    remote_nodes = []
    for i in range(num_nodes):
-        if i > 0 and "_internal_config" in init_kwargs:
-            del init_kwargs["_internal_config"]
+        if i > 0 and "_system_config" in init_kwargs:
+            del init_kwargs["_system_config"]
        remote_nodes.append(cluster.add_node(**init_kwargs))
        # We assume driver will connect to the head (first node),
        # so ray init will be invoked if do_init is true
@@ -164,10 +163,10 @@ def ray_start_cluster_2_nodes(request):
 def ray_start_object_store_memory(request):
    # Start the Ray processes.
    store_size = request.param
-    internal_config = get_default_fixure_internal_config()
+    system_config = get_default_fixure_system_config()
    init_kwargs = {
        "num_cpus": 1,
-        "_internal_config": internal_config,
+        "_system_config": system_config,
        "object_store_memory": store_size,
    }
    ray.init(**init_kwargs)
@@ -208,12 +207,12 @@ def call_ray_stop_only():

@pytest.fixture()
 def two_node_cluster():
-    internal_config = json.dumps({
-        "initial_reconstruction_timeout_milliseconds": 200,
+    system_config = {
+        "object_timeout_milliseconds": 200,
        "num_heartbeats_timeout": 10,
-    })
+    }
    cluster = ray.cluster_utils.Cluster(
-        head_node_args={"_internal_config": internal_config})
+        head_node_args={"_system_config": system_config})
    for _ in range(2):
        remote_node = cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)
@@ -1,5 +1,4 @@
 import collections
-import json
 import numpy as np
 import os
 import pytest
@@ -8,94 +7,22 @@ import sys
 import time

 import ray
-import ray.ray_constants as ray_constants
 import ray.test_utils
 import ray.cluster_utils
 from ray.test_utils import (
    wait_for_condition,
    wait_for_pid_to_exit,
-    generate_internal_config_map,
+    generate_system_config_map,
    get_other_nodes,
    SignalActor,
-    get_error_message,
 )

 SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM


-@pytest.fixture
-def ray_checkpointable_actor_cls(request):
-    checkpoint_dir = os.path.join(ray.utils.get_user_temp_dir(),
-                                  "ray_temp_checkpoint_dir") + os.sep
-    if not os.path.isdir(checkpoint_dir):
-        os.mkdir(checkpoint_dir)
-
-    class CheckpointableActor(ray.actor.Checkpointable):
-        def __init__(self):
-            self.value = 0
-            self.resumed_from_checkpoint = False
-            self.checkpoint_dir = checkpoint_dir
-
-        def node_id(self):
-            return ray.worker.global_worker.node.unique_id
-
-        def increase(self):
-            self.value += 1
-            return self.value
-
-        def get(self):
-            return self.value
-
-        def was_resumed_from_checkpoint(self):
-            return self.resumed_from_checkpoint
-
-        def get_pid(self):
-            return os.getpid()
-
-        def should_checkpoint(self, checkpoint_context):
-            # Checkpoint the actor when value is increased to 3.
-            should_checkpoint = self.value == 3
-            return should_checkpoint
-
-        def save_checkpoint(self, actor_id, checkpoint_id):
-            actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
-            # Save checkpoint into a file.
-            with open(self.checkpoint_dir + actor_id, "a+") as f:
-                print(checkpoint_id, self.value, file=f)
-
-        def load_checkpoint(self, actor_id, available_checkpoints):
-            actor_id = actor_id.hex()
-            filename = self.checkpoint_dir + actor_id
-            # Load checkpoint from the file.
-            if not os.path.isfile(filename):
-                return None
-
-            available_checkpoint_ids = [
-                c.checkpoint_id for c in available_checkpoints
-            ]
-            with open(filename, "r") as f:
-                for line in f:
-                    checkpoint_id, value = line.strip().split(" ")
-                    checkpoint_id = ray.ActorCheckpointID(
-                        ray.utils.hex_to_binary(checkpoint_id))
-                    if checkpoint_id in available_checkpoint_ids:
-                        self.value = int(value)
-                        self.resumed_from_checkpoint = True
-                        return checkpoint_id
-                return None
-
-        def checkpoint_expired(self, actor_id, checkpoint_id):
-            pass
-
-    return CheckpointableActor
-
-
@pytest.fixture
 def ray_init_with_task_retry_delay():
-    address = ray.init(
-        _internal_config=json.dumps({
-            "task_retry_delay_ms": 100
-        }))
+    address = ray.init(_system_config={"task_retry_delay_ms": 100})
    yield address
    ray.shutdown()

@@ -284,15 +211,15 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):


 def test_actor_restart_on_node_failure(ray_start_cluster):
-    config = json.dumps({
+    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 1000,
+        "object_timeout_milliseconds": 1000,
        "task_retry_delay_ms": 100,
-    })
+    }
    cluster = ray_start_cluster
    # Head node with no resources.
-    cluster.add_node(num_cpus=0, _internal_config=config)
+    cluster.add_node(num_cpus=0, _system_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

@@ -441,15 +368,14 @@ def test_caller_task_reconstruction(ray_start_regular):
    assert ray.get(RetryableTask.remote(remote_actor)) == 3


-# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
+# NOTE(hchen): we set object_timeout_milliseconds to 1s for
 # this test. Because if this value is too small, spurious task reconstruction
 # may happen and cause the test failure. If the value is too large, this test
 # could be very slow. We can remove this once we support dynamic timeout.
@pytest.mark.parametrize(
    "ray_start_cluster_head", [
-        generate_internal_config_map(
-            initial_reconstruction_timeout_milliseconds=1000,
-            num_heartbeats_timeout=10)
+        generate_system_config_map(
+            object_timeout_milliseconds=1000, num_heartbeats_timeout=10)
    ],
    indirect=True)
 def test_multiple_actor_restart(ray_start_cluster_head):
@@ -520,287 +446,6 @@ def kill_actor(actor):
    wait_for_pid_to_exit(pid)


-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
-    """Test actor checkpointing and restoring from a checkpoint."""
-    actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
-    # Call increase 3 times, triggering a checkpoint.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor was resumed from a checkpoint and its value is
-    # still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again and check that restart still works after the
-    # actor resuming from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
-    """Test checkpointing of a remote actor through method invocation."""
-
-    # Define a class that exposes a method to save checkpoints.
-    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
-        def __init__(self):
-            super(RemoteCheckpointableActor, self).__init__()
-            self._should_checkpoint = False
-
-        def checkpoint(self):
-            self._should_checkpoint = True
-
-        def should_checkpoint(self, checkpoint_context):
-            should_checkpoint = self._should_checkpoint
-            self._should_checkpoint = False
-            return should_checkpoint
-
-    cls = ray.remote(max_restarts=2)(RemoteCheckpointableActor)
-    actor = cls.remote()
-    # Call increase 3 times.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Call a checkpoint task.
-    actor.checkpoint.remote()
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor was resumed from a checkpoint and its value is
-    # still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again and check that restart still works after the
-    # actor resuming from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
-                                       ray_checkpointable_actor_cls):
-    """Test actor checkpointing on a remote node."""
-    # Place the actor on the remote node.
-    cluster = ray_start_cluster_2_nodes
-    remote_node = list(cluster.worker_nodes)
-    actor_cls = ray.remote(max_restarts=1)(ray_checkpointable_actor_cls)
-    actor = actor_cls.remote()
-    while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
-        actor = actor_cls.remote()
-
-    # Call increase several times.
-    expected = 0
-    for _ in range(6):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    cluster.remove_node(remote_node[0])
-    # Assert that the actor was resumed from a checkpoint and its value is
-    # still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_save_exception(ray_start_regular, error_pubsub,
-                                      ray_checkpointable_actor_cls):
-    """Test actor can still be recovered if checkpoints fail to complete."""
-
-    p = error_pubsub
-
-    @ray.remote(max_restarts=2)
-    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
-        def save_checkpoint(self, actor_id, checkpoint_context):
-            raise Exception("Intentional error saving checkpoint.")
-
-    actor = RemoteCheckpointableActor.remote()
-    # Call increase 3 times, triggering a checkpoint that will fail.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor still wasn't resumed from a checkpoint and its
-    # value is still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again, and check that restart still works and the actor
-    # wasn't resumed from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Check that the checkpoint error was pushed to the driver.
-    errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
-    assert len(errors) == 1
-    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
-
-
-@pytest.mark.skip(reason="TODO: Actor checkpointing")
-def test_checkpointing_load_exception(ray_start_regular, error_pubsub,
-                                      ray_checkpointable_actor_cls):
-    """Test actor can still be recovered if checkpoints fail to load."""
-
-    p = error_pubsub
-
-    @ray.remote(max_restarts=2)
-    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
-        def load_checkpoint(self, actor_id, checkpoints):
-            raise Exception("Intentional error loading checkpoint.")
-
-    actor = RemoteCheckpointableActor.remote()
-    # Call increase 3 times, triggering a checkpoint that will succeed.
-    expected = 0
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Assert that the actor wasn't resumed from a checkpoint because loading
-    # it failed.
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-    # Kill actor process.
-    kill_actor(actor)
-    # Assert that the actor still wasn't resumed from a checkpoint and its
-    # value is still correct.
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Submit some more tasks. These should get replayed since they happen after
-    # the checkpoint.
-    for _ in range(3):
-        ray.get(actor.increase.remote())
-        expected += 1
-    # Kill actor again, and check that restart still works and the actor
-    # wasn't resumed from a checkpoint.
-    kill_actor(actor)
-    assert ray.get(actor.get.remote()) == expected
-    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
-
-    # Check that the checkpoint error was pushed to the driver.
-    errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR)
-    assert len(errors) == 1
-    assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
-
-
-@pytest.mark.parametrize(
-    "ray_start_regular",
-    # This overwrite currently isn't effective,
-    # see https://github.com/ray-project/ray/issues/3926.
-    [generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
-    indirect=True,
-)
-def test_deleting_actor_checkpoint(ray_start_regular):
-    """Test deleting old actor checkpoints."""
-
-    @ray.remote
-    class CheckpointableActor(ray.actor.Checkpointable):
-        def __init__(self):
-            self.checkpoint_ids = []
-
-        def get_checkpoint_ids(self):
-            return self.checkpoint_ids
-
-        def should_checkpoint(self, checkpoint_context):
-            # Save checkpoints after every task
-            return True
-
-        def save_checkpoint(self, actor_id, checkpoint_id):
-            self.checkpoint_ids.append(checkpoint_id)
-            pass
-
-        def load_checkpoint(self, actor_id, available_checkpoints):
-            pass
-
-        def checkpoint_expired(self, actor_id, checkpoint_id):
-            assert checkpoint_id == self.checkpoint_ids[0]
-            del self.checkpoint_ids[0]
-
-    actor = CheckpointableActor.remote()
-    for i in range(19):
-        assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
-    for _ in range(20):
-        assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
-
-
-def test_bad_checkpointable_actor_class():
-    """Test error raised if an actor class doesn't implement all abstract
-    methods in the Checkpointable interface."""
-
-    with pytest.raises(TypeError):
-
-        @ray.remote
-        class BadCheckpointableActor(ray.actor.Checkpointable):
-            def should_checkpoint(self, checkpoint_context):
-                return True
-
-
-def test_init_exception_in_checkpointable_actor(
-        ray_start_regular, error_pubsub, ray_checkpointable_actor_cls):
-    # This test is similar to test_failure.py::test_failed_actor_init.
-    # This test is used to guarantee that checkpointable actor does not
-    # break the same logic.
-    error_message1 = "actor constructor failed"
-    error_message2 = "actor method failed"
-
-    p = error_pubsub
-
-    @ray.remote
-    class CheckpointableFailedActor(ray_checkpointable_actor_cls):
-        def __init__(self):
-            raise Exception(error_message1)
-
-        def fail_method(self):
-            raise Exception(error_message2)
-
-        def should_checkpoint(self, checkpoint_context):
-            return True
-
-    a = CheckpointableFailedActor.remote()
-
-    # Make sure that we get errors from a failed constructor.
-    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
-    assert len(errors) == 1
-    assert error_message1 in errors[0].error_message
-
-    # Make sure that we get errors from a failed method.
-    a.fail_method.remote()
-    errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR)
-    assert len(errors) == 1
-    assert error_message1 in errors[0].error_message
-
-
 def test_decorated_method(ray_start_regular):
    def method_invocation_decorator(f):
        def new_f_invocation(args, kwargs):
@@ -987,7 +632,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
            return self.dependency

    # Make sure it is scheduled in the second node.
-    @ray.remote(resources={"node": 1}, num_cpus=1)
+    @ray.remote(resources={"node": 1})
    class Owner:
        def get_pid(self):
            return os.getpid()
@@ -1004,7 +649,7 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
            # Wait until the `Caller` start executing the remote `call` method.
            ray.get(signal_handle.wait.remote())

-    @ray.remote
+    @ray.remote(resources={"caller": 1})
    class Caller:
        def call(self, owner_pid, signal_handle, actor_handle):
            # Notify the `Owner` that the `Caller` is executing the remote
@@ -1020,15 +665,15 @@ def test_actor_owner_node_dies_before_dependency_ready(ray_start_cluster):
            return True

    cluster = ray_start_cluster
-    node_to_be_broken = cluster.add_node(num_cpus=1, resources={"node": 1})
+    node_to_be_broken = cluster.add_node(resources={"node": 1})
+    cluster.add_node(resources={"caller": 1})

    owner = Owner.remote()
    owner_pid = ray.get(owner.get_pid.remote())

    caller = Caller.remote()
-    owner.create_actor.remote(caller)
+    ray.get(owner.create_actor.remote(caller))
    cluster.remove_node(node_to_be_broken)
-    # Wait for the `Owner` to exit.
    wait_for_pid_to_exit(owner_pid)

    # It will hang here if location is not properly resolved.
@@ -1,5 +1,4 @@
 import collections
-import json
 import os
 import pytest
 try:
@@ -241,9 +240,7 @@ def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
        cluster.add_node(
            num_cpus=10 * num_gpus_per_raylet,
            num_gpus=num_gpus_per_raylet,
-            _internal_config=json.dumps({
-                "num_heartbeats_timeout": 1000
-            } if i == 0 else {}))
+            _system_config={"num_heartbeats_timeout": 1000} if i == 0 else {})
    ray.init(address=cluster.address)

    @ray.remote
@@ -2,7 +2,6 @@
 import glob
 import logging
 import os
-import json
 import sys
 import socket
 import time
@@ -69,9 +68,9 @@ def test_local_scheduling_first(ray_start_cluster):
    # Disable worker caching.
    cluster.add_node(
        num_cpus=num_cpus,
-        _internal_config=json.dumps({
+        _system_config={
            "worker_lease_timeout_milliseconds": 0,
-        }))
+        })
    cluster.add_node(num_cpus=num_cpus)
    ray.init(address=cluster.address)

@@ -332,9 +331,7 @@ def test_wait_reconstruction(shutdown_only):
    ray.init(
        num_cpus=1,
        object_store_memory=int(10**8),
-        _internal_config=json.dumps({
-            "object_pinning_enabled": 0
-        }))
+        _system_config={"object_pinning_enabled": 0})

    @ray.remote
    def f():
@@ -607,11 +604,7 @@ def test_move_log_files_to_old(shutdown_only):


 def test_lease_request_leak(shutdown_only):
-    ray.init(
-        num_cpus=1,
-        _internal_config=json.dumps({
-            "initial_reconstruction_timeout_milliseconds": 200
-        }))
+    ray.init(num_cpus=1, _system_config={"object_timeout_milliseconds": 200})
    assert len(ray.objects()) == 0

    @ray.remote
@@ -3,7 +3,6 @@ import numpy as np
 from numpy.testing import assert_equal, assert_almost_equal
 import pytest
 import sys
-import json

 import ray
 import ray.experimental.array.remote as ra
@@ -59,13 +58,13 @@ def test_distributed_array_assemble(ray_start_2_cpus, reload_modules):
@pytest.mark.parametrize(
    "ray_start_cluster_2_nodes",
    [{
-        "_internal_config": json.dumps({
+        "_system_config": {
            # NOTE(swang): If plasma store notifications to the raylet for new
            # objects are delayed by long enough, then this causes concurrent
            # fetch calls to timeout and mistakenly mark the object as lost.
            # Set the timeout very high to prevent this.
-            "initial_reconstruction_timeout_milliseconds": 60000,
-        })
+            "object_timeout_milliseconds": 60000,
+        }
    }],
    indirect=True)
 def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules):
@@ -1,6 +1,5 @@
 # coding: utf-8
 import io
-import json
 import logging
 import os
 import pickle
@@ -206,10 +205,7 @@ def test_background_tasks_with_max_calls(shutdown_only):


 def test_fair_queueing(shutdown_only):
-    ray.init(
-        num_cpus=1, _internal_config=json.dumps({
-            "fair_queueing_enabled": 1
-        }))
+    ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1})

    @ray.remote
    def h():
@@ -1,5 +1,4 @@
 # coding: utf-8
-import json
 import logging
 import sys
 import threading
@@ -333,19 +332,16 @@ def test_call_chain(ray_start_cluster):
    assert ray.get(x) == 100


-def test_internal_config_when_connecting(ray_start_cluster):
-    config = json.dumps({
-        "object_pinning_enabled": 0,
-        "initial_reconstruction_timeout_milliseconds": 200
-    })
+def test_system_config_when_connecting(ray_start_cluster):
+    config = {"object_pinning_enabled": 0, "object_timeout_milliseconds": 200}
    cluster = ray.cluster_utils.Cluster()
    cluster.add_node(
-        _internal_config=config, object_store_memory=100 * 1024 * 1024)
+        _system_config=config, object_store_memory=100 * 1024 * 1024)
    cluster.wait_for_nodes()

-    # Specifying _internal_config when connecting to a cluster is disallowed.
+    # Specifying _system_config when connecting to a cluster is disallowed.
    with pytest.raises(ValueError):
-        ray.init(address=cluster.address, _internal_config=config)
+        ray.init(address=cluster.address, _system_config=config)

    # Check that the config was picked up (object pinning is disabled).
    ray.init(address=cluster.address)
@@ -1,4 +1,3 @@
-import json
 import os
 import signal
 import sys
@@ -138,9 +137,9 @@ def check_components_alive(cluster, component_type, check_component_alive):
    "ray_start_cluster", [{
        "num_cpus": 8,
        "num_nodes": 4,
-        "_internal_config": json.dumps({
+        "_system_config": {
            "num_heartbeats_timeout": 100
-        }),
+        },
    }],
    indirect=True)
 def test_raylet_failed(ray_start_cluster):
@@ -1,4 +1,3 @@
-import json
 import logging
 import os
 import sys
@@ -908,12 +907,12 @@ def test_raylet_crash_when_get(ray_start_regular):


 def test_connect_with_disconnected_node(shutdown_only):
-    config = json.dumps({
+    config = {
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_timeout_milliseconds": 10,
-    })
+    }
    cluster = Cluster()
-    cluster.add_node(num_cpus=0, _internal_config=config)
+    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    p = init_error_pubsub()
    errors = get_error_message(p, 1, timeout=5)
@@ -943,9 +942,9 @@ def test_connect_with_disconnected_node(shutdown_only):
    "ray_start_cluster_head", [{
        "num_cpus": 5,
        "object_store_memory": 10**8,
-        "_internal_config": json.dumps({
+        "_system_config": {
            "object_store_full_max_retries": 0
-        })
+        }
    }],
    indirect=True)
 def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
@@ -965,9 +964,7 @@ def test_fill_object_store_exception(shutdown_only):
    ray.init(
        num_cpus=2,
        object_store_memory=10**8,
-        _internal_config=json.dumps({
-            "object_store_full_max_retries": 0
-        }))
+        _system_config={"object_store_full_max_retries": 0})

    @ray.remote
    def expensive_task():
@@ -997,14 +994,14 @@ def test_fill_object_store_exception(shutdown_only):


 def test_fill_object_store_lru_fallback(shutdown_only):
-    config = json.dumps({
+    config = {
        "free_objects_batch_size": 1,
-    })
+    }
    ray.init(
        num_cpus=2,
        object_store_memory=10**8,
        lru_evict=True,
-        _internal_config=config)
+        _system_config=config)

    @ray.remote
    def expensive_task():
@@ -1125,13 +1122,13 @@ def test_serialized_id(ray_start_cluster):
                         [(False, False), (False, True), (True, False),
                          (True, True)])
 def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
-    config = json.dumps({
+    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-    })
+    }
    cluster = Cluster()
    # Head node with no resources.
-    cluster.add_node(num_cpus=0, _internal_config=config)
+    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
@@ -3,7 +3,7 @@ import sys
 import ray
 import pytest
 from ray.test_utils import (
-    generate_internal_config_map,
+    generate_system_config_map,
    wait_for_condition,
    wait_for_pid_to_exit,
 )
@@ -22,7 +22,7 @@ def increase(x):

@pytest.mark.parametrize(
    "ray_start_regular",
-    [generate_internal_config_map(num_heartbeats_timeout=20)],
+    [generate_system_config_map(num_heartbeats_timeout=20)],
    indirect=True)
 def test_gcs_server_restart(ray_start_regular):
    actor1 = Increase.remote()
@@ -45,7 +45,7 @@ def test_gcs_server_restart(ray_start_regular):

@pytest.mark.parametrize(
    "ray_start_regular",
-    [generate_internal_config_map(num_heartbeats_timeout=20)],
+    [generate_system_config_map(num_heartbeats_timeout=20)],
    indirect=True)
 def test_gcs_server_restart_during_actor_creation(ray_start_regular):
    ids = []
@@ -64,7 +64,7 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular):

@pytest.mark.parametrize(
    "ray_start_cluster_head",
-    [generate_internal_config_map(num_heartbeats_timeout=20)],
+    [generate_system_config_map(num_heartbeats_timeout=20)],
    indirect=True)
 def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
    """Checks that the node failure detector is correct when gcs server restart.
@@ -1,4 +1,3 @@
-import json
 import pytest
 try:
    import pytest_timeout
@@ -140,9 +139,9 @@ def test_load_report(shutdown_only, max_shapes):
    cluster = ray.init(
        num_cpus=1,
        resources={resource1: 1},
-        _internal_config=json.dumps({
+        _system_config={
            "max_resource_shapes_per_load_report": max_shapes,
-        }))
+        })
    redis = ray.services.create_redis_client(
        cluster["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
@@ -48,10 +48,7 @@ def _setup_cluster_for_test(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
-    cluster.add_node(
-        _internal_config=json.dumps({
-            "metrics_report_interval_ms": 1000
-        }))
+    cluster.add_node(_system_config={"metrics_report_interval_ms": 1000})
    # Add worker nodes.
    [cluster.add_node() for _ in range(NUM_NODES - 1)]
    cluster.wait_for_nodes()
@@ -6,7 +6,7 @@ import ray
 import ray.ray_constants as ray_constants
 from ray.monitor import Monitor
 from ray.cluster_utils import Cluster
-from ray.test_utils import generate_internal_config_map, SignalActor
+from ray.test_utils import generate_system_config_map, SignalActor

 logger = logging.getLogger(__name__)

@@ -33,12 +33,11 @@ def test_shutdown():

@pytest.mark.parametrize(
    "ray_start_cluster_head", [
-        generate_internal_config_map(
-            num_heartbeats_timeout=20,
-            initial_reconstruction_timeout_milliseconds=12345)
+        generate_system_config_map(
+            num_heartbeats_timeout=20, object_timeout_milliseconds=12345)
    ],
    indirect=True)
-def test_internal_config(ray_start_cluster_head):
+def test_system_config(ray_start_cluster_head):
    """Checks that the internal configuration setting works.

    We set the cluster to timeout nodes after 2 seconds of no timeouts. We
@@ -52,8 +51,7 @@ def test_internal_config(ray_start_cluster_head):

    @ray.remote
    def f():
-        assert ray._config.initial_reconstruction_timeout_milliseconds(
-        ) == 12345
+        assert ray._config.object_timeout_milliseconds() == 12345
        assert ray._config.num_heartbeats_timeout() == 20

    ray.get([f.remote() for _ in range(5)])
@@ -1,5 +1,4 @@
 # coding: utf-8
-import json
 import os
 import sys

@@ -19,9 +18,7 @@ def test_initial_workers(shutdown_only):
    ray.init(
        num_cpus=1,
        include_dashboard=True,
-        _internal_config=json.dumps({
-            "enable_multi_tenancy": True
-        }))
+        _system_config={"enable_multi_tenancy": True})
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
@@ -43,11 +40,7 @@ def test_initial_workers(shutdown_only):
 # different drivers were scheduled to the same worker process, that is, tasks
 # of different jobs were not correctly isolated during execution.
 def test_multi_drivers(shutdown_only):
-    info = ray.init(
-        num_cpus=10,
-        _internal_config=json.dumps({
-            "enable_multi_tenancy": True
-        }))
+    info = ray.init(num_cpus=10, _system_config={"enable_multi_tenancy": True})

    driver_code = """
 import os
@@ -120,9 +113,7 @@ def test_worker_env(shutdown_only):
            "foo1": "bar1",
            "foo2": "bar2"
        }),
-        _internal_config=json.dumps({
-            "enable_multi_tenancy": True
-        }))
+        _system_config={"enable_multi_tenancy": True})

    @ray.remote
    def get_env(key):
@@ -1,4 +1,3 @@
-import json
 import os
 import signal
 import sys
@@ -145,10 +144,10 @@ def check_components_alive(cluster, component_type, check_component_alive):
    [{
        "num_cpus": 8,
        "num_nodes": 4,
-        "_internal_config": json.dumps({
+        "_system_config": {
            # Raylet codepath is not stable with a shorter timeout.
            "num_heartbeats_timeout": 10
-        }),
+        },
    }],
    indirect=True)
 def test_raylet_failed(ray_start_cluster):
@@ -1,4 +1,3 @@
-import json
 import os
 import sys
 import time
@@ -19,13 +18,13 @@ import ray.ray_constants as ray_constants
        "num_cpus": 1,
        "num_nodes": 4,
        "object_store_memory": 1000 * 1024 * 1024,
-        "_internal_config": json.dumps({
+        "_system_config": {
            # Raylet codepath is not stable with a shorter timeout.
            "num_heartbeats_timeout": 10,
            "object_manager_pull_timeout_ms": 1000,
            "object_manager_push_timeout_ms": 1000,
            "object_manager_repeated_push_delay_ms": 1000,
-        }),
+        },
    }],
    indirect=True)
 def test_object_reconstruction(ray_start_cluster):
@@ -1,5 +1,4 @@
 from collections import defaultdict
-import json
 import multiprocessing
 import numpy as np
 import pytest
@@ -207,14 +206,14 @@ def test_object_transfer_retry(ray_start_cluster):
    # Also, force the receiving object manager to retry the pull sooner. We
    # make the chunk size smaller in order to make it easier to test objects
    # with multiple chunks.
-    config = json.dumps({
+    config = {
        "object_manager_repeated_push_delay_ms": repeated_push_delay * 1000,
        "object_manager_pull_timeout_ms": repeated_push_delay * 1000 / 4,
        "object_manager_default_chunk_size": 1000
-    })
+    }
    object_store_memory = 150 * 1024 * 1024
    cluster.add_node(
-        object_store_memory=object_store_memory, _internal_config=config)
+        object_store_memory=object_store_memory, _system_config=config)
    cluster.add_node(num_gpus=1, object_store_memory=object_store_memory)
    ray.init(address=cluster.address)

@@ -17,10 +17,10 @@ def test_spill_objects_manually(shutdown_only):
                "directory_path": "/tmp"
            }
        },
-        _internal_config=json.dumps({
+        _system_config={
            "object_store_full_max_retries": 0,
            "max_io_workers": 4,
-        }))
+        })
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []
    pinned_objects = set()
@@ -64,10 +64,10 @@ def test_spill_objects_manually_from_workers(shutdown_only):
                "directory_path": "/tmp"
            }
        },
-        _internal_config=json.dumps({
+        _system_config={
            "object_store_full_max_retries": 0,
            "max_io_workers": 4,
-        }))
+        })

    @ray.remote
    def _worker():
@@ -90,10 +90,10 @@ def test_spill_objects_manually_with_workers(shutdown_only):
                "directory_path": "/tmp"
            }
        },
-        _internal_config=json.dumps({
+        _system_config={
            "object_store_full_max_retries": 0,
            "max_io_workers": 4,
-        }))
+        })
    arrays = [np.random.rand(100 * 1024) for _ in range(50)]
    objects = [ray.put(arr) for arr in arrays]

@@ -117,7 +117,7 @@ def test_spill_objects_manually_with_workers(shutdown_only):
                "directory_path": "/tmp"
            }
        },
-        "_internal_config": json.dumps({
+        "_system_config": json.dumps({
            "object_store_full_max_retries": 0,
            "max_io_workers": 4,
        }),
@@ -159,7 +159,7 @@ def test_spill_objects_automatically(shutdown_only):
    # Limit our object store to 75 MiB of memory.
    ray.init(
        object_store_memory=75 * 1024 * 1024,
-        _internal_config=json.dumps({
+        _system_config=json.dumps({
            "max_io_workers": 4,
            "object_store_full_max_retries": 2,
            "object_store_full_initial_delay_ms": 10,
@@ -1,4 +1,3 @@
-import json
 import os
 import signal
 import sys
@@ -16,14 +15,14 @@ SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM


 def test_cached_object(ray_start_cluster):
-    config = json.dumps({
+    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
-    })
+        "object_timeout_milliseconds": 200,
+    }
    cluster = ray_start_cluster
    # Head node with no resources.
-    cluster.add_node(num_cpus=0, _internal_config=config)
+    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
@@ -61,18 +60,17 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
-    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
-        _internal_config=config,
+        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
@@ -121,18 +119,17 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
-    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
-        _internal_config=config,
+        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
@@ -171,18 +168,17 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
-    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
-        _internal_config=config,
+        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
@@ -229,18 +225,17 @@ def test_basic_reconstruction_actor_task(ray_start_cluster,
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
-    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
-        _internal_config=config,
+        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
@@ -303,18 +298,17 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
-    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
-        _internal_config=config,
+        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
@@ -384,18 +378,17 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
-    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
-        _internal_config=config,
+        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    # Node to place the initial object.
@@ -445,18 +438,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = 0
-    config = json.dumps(config)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
-        _internal_config=config,
+        _system_config=config,
        object_store_memory=10**8,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
@@ -493,17 +485,17 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):


 def test_reconstruction_stress(ray_start_cluster):
-    config = json.dumps({
+    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "max_direct_call_object_size": 100,
        "task_retry_delay_ms": 100,
-        "initial_reconstruction_timeout_milliseconds": 200,
-    })
+        "object_timeout_milliseconds": 200,
+    }
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
-        num_cpus=0, _internal_config=config, enable_object_reconstruction=True)
+        num_cpus=0, _system_config=config, enable_object_reconstruction=True)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
@@ -1,6 +1,5 @@
 # coding: utf-8
 import copy
-import json
 import logging
 import os
 import time
@@ -18,14 +17,14 @@ logger = logging.getLogger(__name__)

@pytest.fixture
 def one_worker_100MiB(request):
-    config = json.dumps({
+    config = {
        "object_store_full_max_retries": 2,
        "task_retry_delay_ms": 0,
-    })
+    }
    yield ray.init(
        num_cpus=1,
        object_store_memory=100 * 1024 * 1024,
-        _internal_config=config)
+        _system_config=config)
    ray.shutdown()


@@ -245,9 +244,7 @@ def test_pending_task_dependency_pinning(one_worker_100MiB):
 def test_feature_flag(shutdown_only):
    ray.init(
        object_store_memory=100 * 1024 * 1024,
-        _internal_config=json.dumps({
-            "object_pinning_enabled": 0
-        }))
+        _system_config={"object_pinning_enabled": 0})

    @ray.remote
    def f(array):
@@ -1,5 +1,4 @@
 # coding: utf-8
-import json
 import logging
 import os
 import signal
@@ -20,15 +19,15 @@ logger = logging.getLogger(__name__)

@pytest.fixture
 def one_worker_100MiB(request):
-    config = json.dumps({
+    config = {
        "object_store_full_max_retries": 2,
        "task_retry_delay_ms": 0,
-        "initial_reconstruction_timeout_milliseconds": 1000,
-    })
+        "object_timeout_milliseconds": 1000,
+    }
    yield ray.init(
        num_cpus=1,
        object_store_memory=100 * 1024 * 1024,
-        _internal_config=config)
+        _system_config=config)
    ray.shutdown()


@@ -1,4 +1,3 @@
-import json
 import numpy as np
 import os
 import pytest
@@ -23,9 +22,9 @@ def ray_start_reconstruction(request):
            "num_cpus": 1,
            "object_store_memory": plasma_store_memory // num_nodes,
            "redis_max_memory": 10**7,
-            "_internal_config": json.dumps({
-                "initial_reconstruction_timeout_milliseconds": 200
-            })
+            "_system_config": {
+                "object_timeout_milliseconds": 200
+            }
        })
    for i in range(num_nodes - 1):
        cluster.add_node(
@@ -1,5 +1,4 @@
 import inspect
-import json
 import time
 import os
 import pytest
@@ -45,9 +44,9 @@ def _start_new_cluster():
        connect=True,
        head_node_args={
            "num_cpus": 1,
-            "_internal_config": json.dumps({
+            "_system_config": {
                "num_heartbeats_timeout": 10
-            })
+            }
        })
    # Pytest doesn't play nicely with imports
    register_trainable("__fake_remote", MockRemoteTrainer)
@@ -74,9 +73,9 @@ def start_connected_emptyhead_cluster():
        connect=True,
        head_node_args={
            "num_cpus": 0,
-            "_internal_config": json.dumps({
+            "_system_config": {
                "num_heartbeats_timeout": 10
-            })
+            }
        })
    # Pytest doesn't play nicely with imports
    _register_all()
@@ -1,5 +1,4 @@
 # coding: utf-8
-import json
 import unittest

 import ray
@@ -190,9 +189,9 @@ class RayExecutorQueueTest(unittest.TestCase):
            connect=True,
            head_node_args={
                "num_cpus": 1,
-                "_internal_config": json.dumps({
+                "_system_config": {
                    "num_heartbeats_timeout": 10
-                })
+                }
            })
        # Pytest doesn't play nicely with imports
        _register_all()
@@ -107,7 +107,6 @@ class Worker:
        self.actors = {}
        # Information used to maintain actor checkpoints.
        self.actor_checkpoint_info = {}
-        self.actor_task_counter = 0
        # When the worker is constructed. Record the original value of the
        # CUDA_VISIBLE_DEVICES environment variable.
        self.original_gpu_ids = ray.utils.get_cuda_visible_devices()
@@ -515,7 +514,7 @@ def init(address=None,
         load_code_from_local=False,
         java_worker_options=None,
         use_pickle=True,
-         _internal_config=None,
+         _system_config=None,
         lru_evict=False,
         enable_object_reconstruction=False,
         _metrics_export_port=None,
@@ -631,8 +630,9 @@ def init(address=None,
            module or from the GCS.
        java_worker_options: Overwrite the options to start Java workers.
        use_pickle: Deprecated.
-        _internal_config (str): JSON configuration for overriding
-            RayConfig defaults. For testing purposes ONLY.
+        _system_config (dict): Configuration for overriding RayConfig
+            defaults. Used to set system configuration and for experimental Ray
+            core feature flags.
        lru_evict (bool): If True, when an object store is full, it will evict
            objects in LRU order to make more space and when under memory
            pressure, ray.UnreconstructableError may be thrown. If False, then
@@ -706,8 +706,9 @@ def init(address=None,

    raylet_ip_address = node_ip_address

-    _internal_config = (json.loads(_internal_config)
-                        if _internal_config else {})
+    _system_config = _system_config or {}
+    if not isinstance(_system_config, dict):
+        raise TypeError("The _system_config must be a dict.")

    global _global_node
    if redis_address is None:
@@ -742,7 +743,7 @@ def init(address=None,
            load_code_from_local=load_code_from_local,
            java_worker_options=java_worker_options,
            start_initial_python_workers_for_first_job=True,
-            _internal_config=_internal_config,
+            _system_config=_system_config,
            lru_evict=lru_evict,
            enable_object_reconstruction=enable_object_reconstruction,
            metrics_export_port=_metrics_export_port,
@@ -798,9 +799,9 @@ def init(address=None,
        if java_worker_options is not None:
            raise ValueError("When connecting to an existing cluster, "
                             "java_worker_options must not be provided.")
-        if _internal_config is not None and len(_internal_config) != 0:
+        if _system_config is not None and len(_system_config) != 0:
            raise ValueError("When connecting to an existing cluster, "
-                             "_internal_config must not be provided.")
+                             "_system_config must not be provided.")
        if lru_evict:
            raise ValueError("When connecting to an existing cluster, "
                             "lru_evict must not be provided.")
@@ -818,7 +819,7 @@ def init(address=None,
            object_ref_seed=object_ref_seed,
            temp_dir=temp_dir,
            load_code_from_local=load_code_from_local,
-            _internal_config=_internal_config,
+            _system_config=_system_config,
            lru_evict=lru_evict,
            enable_object_reconstruction=enable_object_reconstruction,
            metrics_export_port=_metrics_export_port)
@@ -122,13 +122,13 @@ if __name__ == "__main__":
            object_spilling_config = {}
        external_storage.setup_external_storage(object_spilling_config)

-    internal_config = {}
+    system_config = {}
    if args.config_list is not None:
        config_list = args.config_list.split(",")
        if len(config_list) > 1:
            i = 0
            while i < len(config_list):
-                internal_config[config_list[i]] = config_list[i + 1]
+                system_config[config_list[i]] = config_list[i + 1]
                i += 2

    raylet_ip_address = args.raylet_ip_address
@@ -146,7 +146,7 @@ if __name__ == "__main__":
        temp_dir=args.temp_dir,
        load_code_from_local=args.load_code_from_local,
        metrics_agent_port=args.metrics_agent_port,
-        _internal_config=json.dumps(internal_config),
+        _system_config=system_config,
    )

    node = ray.node.Node(