Merge branch 'master' into py39

2026-06-29 02:30:34 +08:00 · 2020-12-16 11:04:27 -05:00
parent bd866d926d 91878d18b5
commit 7d8a008aeb
489 changed files with 36844 additions and 8744 deletions
@@ -101,7 +101,7 @@ from ray import util  # noqa: E402

 # Replaced with the current commit when building the wheels.
 __commit__ = "{{RAY_COMMIT_SHA}}"
-__version__ = "1.1.0.dev0"
+__version__ = "1.2.0.dev0"

 __all__ = [
    "__version__",
@@ -136,7 +136,7 @@ def find_redis_address(address=None):
    # --redis_address=123.456.78.910 --node_ip_address=123.456.78.910
    # --raylet_socket_name=... --store_socket_name=... --object_manager_port=0
    # --min_worker_port=10000 --max_worker_port=10999
-    # --node_manager_port=58578 --redis_port=6379 --num_initial_workers=8
+    # --node_manager_port=58578 --redis_port=6379
    # --maximum_startup_concurrency=8
    # --static_resource_list=node:123.456.78.910,1.0,object_store_memory,66
    # --config_list=plasma_store_as_thread,True
@@ -279,7 +279,8 @@ def get_address_info_from_redis_helper(redis_address,
 def get_address_info_from_redis(redis_address,
                                node_ip_address,
                                num_retries=5,
-                                redis_password=None):
+                                redis_password=None,
+                                no_warning=False):
    counter = 0
    while True:
        try:
@@ -290,10 +291,11 @@ def get_address_info_from_redis(redis_address,
                raise
            # Some of the information may not be in Redis yet, so wait a little
            # bit.
-            logger.warning(
-                "Some processes that the driver needs to connect to have "
-                "not registered with Redis, so retrying. Have you run "
-                "'ray start' on this node?")
+            if not no_warning:
+                logger.warning(
+                    "Some processes that the driver needs to connect to have "
+                    "not registered with Redis, so retrying. Have you run "
+                    "'ray start' on this node?")
            time.sleep(1)
        counter += 1

@@ -1251,13 +1253,11 @@ def start_raylet(redis_address,
                 stderr_file=None,
                 config=None,
                 java_worker_options=None,
-                 load_code_from_local=False,
                 huge_pages=False,
                 fate_share=None,
                 socket_to_use=None,
                 head_node=False,
-                 start_initial_python_workers_for_first_job=False,
-                 code_search_path=None):
+                 start_initial_python_workers_for_first_job=False):
    """Start a raylet, which is a combined local scheduler and object manager.

    Args:
@@ -1294,9 +1294,6 @@ def start_raylet(redis_address,
        config (dict|None): Optional Raylet configuration that will
            override defaults in RayConfig.
        java_worker_options (list): The command options for Java worker.
-        code_search_path (list): Code search path for worker. code_search_path
-            is added to worker command in non-multi-tenancy mode and job_config
-            in multi-tenancy mode.
    Returns:
        ProcessInfo for the process that was started.
    """
@@ -1309,7 +1306,6 @@ def start_raylet(redis_address,
        raise ValueError("Cannot use valgrind and profiler at the same time.")

    assert resource_spec.resolved()
-    num_initial_workers = resource_spec.num_cpus
    static_resources = resource_spec.to_resource_dict()

    # Limit the number of workers that can be started in parallel by the
@@ -1346,7 +1342,6 @@ def start_raylet(redis_address,
            raylet_name,
            redis_password,
            session_dir,
-            code_search_path,
        )
    else:
        java_worker_command = []
@@ -1366,15 +1361,18 @@ def start_raylet(redis_address,

    # Create the command that the Raylet will use to start workers.
    start_worker_command = [
-        sys.executable, worker_path, f"--node-ip-address={node_ip_address}",
+        sys.executable,
+        worker_path,
+        f"--node-ip-address={node_ip_address}",
        f"--node-manager-port={node_manager_port}",
        f"--object-store-name={plasma_store_name}",
-        f"--raylet-name={raylet_name}", f"--redis-address={redis_address}",
-        f"--config-list={config_str}", f"--temp-dir={temp_dir}",
-        f"--metrics-agent-port={metrics_agent_port}"
+        f"--raylet-name={raylet_name}",
+        f"--redis-address={redis_address}",
+        f"--config-list={config_str}",
+        f"--temp-dir={temp_dir}",
+        f"--metrics-agent-port={metrics_agent_port}",
+        "RAY_WORKER_DYNAMIC_OPTION_PLACEHOLDER",
    ]
-    if code_search_path:
-        start_worker_command.append(f"--code-search-path={code_search_path}")
    if redis_password:
        start_worker_command += [f"--redis-password={redis_password}"]

@@ -1389,12 +1387,6 @@ def start_raylet(redis_address,
    if max_worker_port is None:
        max_worker_port = 0

-    if code_search_path is not None and len(code_search_path) > 0:
-        load_code_from_local = True
-
-    if load_code_from_local:
-        start_worker_command += ["--load-code-from-local"]
-
    # Create agent command
    agent_command = [
        sys.executable,
@@ -1425,7 +1417,6 @@ def start_raylet(redis_address,
        f"--node_ip_address={node_ip_address}",
        f"--redis_address={gcs_ip_address}",
        f"--redis_port={gcs_port}",
-        f"--num_initial_workers={num_initial_workers}",
        f"--maximum_startup_concurrency={maximum_startup_concurrency}",
        f"--static_resource_list={resource_argument}",
        f"--config_list={config_str}",
@@ -1485,8 +1476,7 @@ def get_ray_jars_dir():

 def build_java_worker_command(java_worker_options, redis_address,
                              node_manager_port, plasma_store_name,
-                              raylet_name, redis_password, session_dir,
-                              code_search_path):
+                              raylet_name, redis_password, session_dir):
    """This method assembles the command used to start a Java worker.

    Args:
@@ -1497,7 +1487,6 @@ def build_java_worker_command(java_worker_options, redis_address,
        raylet_name (str): The name of the raylet socket to create.
        redis_password (str): The password of connect to redis.
        session_dir (str): The path of this session.
-        code_search_path (list): Teh job code search path.
    Returns:
        The command string for starting Java worker.
    """
@@ -1518,7 +1507,6 @@ def build_java_worker_command(java_worker_options, redis_address,
    pairs.append(("ray.home", RAY_HOME))
    pairs.append(("ray.logging.dir", os.path.join(session_dir, "logs")))
    pairs.append(("ray.session-dir", session_dir))
-    pairs.append(("ray.job.code-search-path", code_search_path))
    command = ["java"] + ["-D{}={}".format(*pair) for pair in pairs]

    command += ["RAY_WORKER_RAYLET_CONFIG_PLACEHOLDER"]
@@ -336,6 +336,7 @@ cdef execute_task(
        const c_vector[shared_ptr[CRayObject]] &c_args,
        const c_vector[CObjectID] &c_arg_reference_ids,
        const c_vector[CObjectID] &c_return_ids,
+        const c_string debugger_breakpoint,
        c_vector[shared_ptr[CRayObject]] *returns):

    worker = ray.worker.global_worker
@@ -351,6 +352,18 @@ cdef execute_task(
    # Automatically restrict the GPUs available to this task.
    ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())

+    # Helper method used to exit current asyncio actor.
+    # This is called when a KeyboardInterrupt is received by the main thread.
+    # Upon receiving a KeyboardInterrupt signal, Ray will exit the current
+    # worker. If the worker is processing normal tasks, Ray treats it as task
+    # cancellation from ray.cancel(object_ref). If the worker is an asyncio
+    # actor, Ray will exit the actor.
+    def exit_current_actor_if_asyncio():
+        if core_worker.current_actor_is_asyncio():
+            error = SystemExit(0)
+            error.is_ray_terminate = True
+            raise error
+
    function_descriptor = CFunctionDescriptorToPython(
        ray_function.GetFunctionDescriptor())

@@ -457,9 +470,26 @@ cdef execute_task(
                task_exception = True
                try:
                    with ray.worker._changeproctitle(title, next_title):
+                        if debugger_breakpoint != b"":
+                            ray.util.pdb.set_trace(
+                                breakpoint_uuid=debugger_breakpoint)
                        outputs = function_executor(*args, **kwargs)
+                        next_breakpoint = (
+                            ray.worker.global_worker.debugger_breakpoint)
+                        if next_breakpoint != b"":
+                            # If this happens, the user typed "remote" and
+                            # there were no more remote calls left in this
+                            # task. In that case we just exit the debugger.
+                            ray.experimental.internal_kv._internal_kv_put(
+                                "RAY_PDB_{}".format(next_breakpoint),
+                                "{\"exit_debugger\": true}")
+                            ray.experimental.internal_kv._internal_kv_del(
+                                "RAY_PDB_CONTINUE_{}".format(next_breakpoint)
+                            )
+                            ray.worker.global_worker.debugger_breakpoint = b""
                    task_exception = False
                except KeyboardInterrupt as e:
+                    exit_current_actor_if_asyncio()
                    raise TaskCancelledError(
                            core_worker.get_current_task_id())
                if c_return_ids.size() == 1:
@@ -467,6 +497,7 @@ cdef execute_task(
            # Check for a cancellation that was called when the function
            # was exiting and was raised after the except block.
            if not check_signals().ok():
+                exit_current_actor_if_asyncio()
                task_exception = True
                raise TaskCancelledError(
                            core_worker.get_current_task_id())
@@ -523,6 +554,7 @@ cdef CRayStatus task_execution_handler(
        const c_vector[shared_ptr[CRayObject]] &c_args,
        const c_vector[CObjectID] &c_arg_reference_ids,
        const c_vector[CObjectID] &c_return_ids,
+        const c_string debugger_breakpoint,
        c_vector[shared_ptr[CRayObject]] *returns) nogil:

    with gil:
@@ -532,7 +564,7 @@ cdef CRayStatus task_execution_handler(
                # it does, that indicates that there was an internal error.
                execute_task(task_type, task_name, ray_function, c_resources,
                             c_args, c_arg_reference_ids, c_return_ids,
-                             returns)
+                             debugger_breakpoint, returns)
            except Exception:
                traceback_str = traceback.format_exc() + (
                    "An unexpected internal error occurred while the worker "
@@ -1041,6 +1073,7 @@ cdef class CoreWorker:
                    PlacementGroupID placement_group_id,
                    int64_t placement_group_bundle_index,
                    c_bool placement_group_capture_child_tasks,
+                    c_string debugger_breakpoint,
                    override_environment_variables):
        cdef:
            unordered_map[c_string, double] c_resources
@@ -1059,15 +1092,18 @@ cdef class CoreWorker:
                language.lang, function_descriptor.descriptor)
            prepare_args(self, language, args, &args_vector)

-            with nogil:
-                CCoreWorkerProcess.GetCoreWorker().SubmitTask(
-                    ray_function, args_vector, CTaskOptions(
-                        name, num_returns, c_resources,
-                        c_override_environment_variables),
-                    &return_ids, max_retries,
-                    c_pair[CPlacementGroupID, int64_t](
-                        c_placement_group_id, placement_group_bundle_index),
-                    placement_group_capture_child_tasks)
+            # NOTE(edoakes): releasing the GIL while calling this method causes
+            # segfaults. See relevant issue for details:
+            # https://github.com/ray-project/ray/pull/12803
+            CCoreWorkerProcess.GetCoreWorker().SubmitTask(
+                ray_function, args_vector, CTaskOptions(
+                    name, num_returns, c_resources,
+                    c_override_environment_variables),
+                &return_ids, max_retries,
+                c_pair[CPlacementGroupID, int64_t](
+                    c_placement_group_id, placement_group_bundle_index),
+                placement_group_capture_child_tasks,
+                debugger_breakpoint)

            return VectorToObjectRefs(return_ids)

@@ -1170,6 +1206,21 @@ cdef class CoreWorker:
                CCoreWorkerProcess.GetCoreWorker().
                RemovePlacementGroup(c_placement_group_id))

+    def wait_placement_group_ready(self,
+                                   PlacementGroupID placement_group_id,
+                                   int32_t timeout_seconds):
+        """Wait on a placement group via the core worker.
+
+        Args:
+            placement_group_id: ID of the placement group to wait for.
+            timeout_seconds: Timeout value forwarded to the core worker's
+                WaitPlacementGroupReady call.
+
+        Returns:
+            bool: The ``ok()`` value of the status returned by the core
+                worker call.
+
+        Raises:
+            Exception: If the returned status reports "not found".
+        """
+        cdef CRayStatus status
+        cdef CPlacementGroupID cplacement_group_id = (
+            CPlacementGroupID.FromBinary(placement_group_id.binary()))
+        cdef int ctimeout_seconds = timeout_seconds
+        # Only the C++ call itself runs with the GIL released.
+        with nogil:
+            status = CCoreWorkerProcess.GetCoreWorker() \
+                .WaitPlacementGroupReady(cplacement_group_id, ctimeout_seconds)
+        # Formatting the message and raising a Python exception require the
+        # GIL, so the status is inspected only after leaving the nogil block.
+        if status.IsNotFound():
+            raise Exception("Placement group {} does not exist.".format(
+                placement_group_id))
+        return status.ok()
+
    def submit_actor_task(self,
                          Language language,
                          ActorID actor_id,
@@ -1193,12 +1244,14 @@ cdef class CoreWorker:
                language.lang, function_descriptor.descriptor)
            prepare_args(self, language, args, &args_vector)

-            with nogil:
-                CCoreWorkerProcess.GetCoreWorker().SubmitActorTask(
-                    c_actor_id,
-                    ray_function,
-                    args_vector, CTaskOptions(name, num_returns, c_resources),
-                    &return_ids)
+            # NOTE(edoakes): releasing the GIL while calling this method causes
+            # segfaults. See relevant issue for details:
+            # https://github.com/ray-project/ray/pull/12803
+            CCoreWorkerProcess.GetCoreWorker().SubmitActorTask(
+                c_actor_id,
+                ray_function,
+                args_vector, CTaskOptions(name, num_returns, c_resources),
+                &return_ids)

            return VectorToObjectRefs(return_ids)

@@ -1400,8 +1453,16 @@ cdef class CoreWorker:
                context = worker.get_serialization_context()
                serialized_object = context.serialize(output)
                data_sizes.push_back(serialized_object.total_bytes)
-                metadatas.push_back(
-                    string_to_buffer(serialized_object.metadata))
+                metadata = serialized_object.metadata
+                if ray.worker.global_worker.debugger_get_breakpoint:
+                    breakpoint = (
+                        ray.worker.global_worker.debugger_get_breakpoint)
+                    metadata += (
+                        b"," + ray_constants.OBJECT_METADATA_DEBUG_PREFIX +
+                        breakpoint.encode())
+                    # Reset debugging context of this worker.
+                    ray.worker.global_worker.debugger_get_breakpoint = b""
+                metadatas.push_back(string_to_buffer(metadata))
                serialized_objects.append(serialized_object)
                contained_ids.push_back(
                    ObjectRefsToVector(serialized_object.contained_object_refs)
@@ -1,6 +1,7 @@
 import inspect
 import logging
 import weakref
+import _thread

 import ray.ray_constants as ray_constants
 import ray._raylet
@@ -1006,6 +1007,7 @@ def exit_actor():
    """Intentionally exit the current actor.

    This function is used to disconnect an actor and exit the worker.
+    Any ``atexit`` handlers installed in the actor will be run.

    Raises:
        Exception: An exception is raised if this is a driver or this
@@ -1018,6 +1020,14 @@ def exit_actor():
        ray.disconnect()
        # Disconnect global state from GCS.
        ray.state.state.disconnect()
+
+        # In asyncio actor mode, we can't raise SystemExit because it will just
+        # quit the asyncio event loop thread, not the main thread. Instead, we
+        # raise an interrupt signal to the main thread to tell it to exit.
+        if worker.core_worker.current_actor_is_asyncio():
+            _thread.interrupt_main()
+            return
+
        # Set a flag to indicate this is an intentional actor exit. This
        # reduces log verbosity.
        exit = SystemExit(0)
@@ -13,18 +13,19 @@ import collections

 from ray.experimental.internal_kv import _internal_kv_put, \
    _internal_kv_initialized
-from ray.autoscaler.tags import (TAG_RAY_LAUNCH_CONFIG, TAG_RAY_RUNTIME_CONFIG,
-                                 TAG_RAY_FILE_MOUNTS_CONTENTS,
-                                 TAG_RAY_NODE_STATUS, TAG_RAY_NODE_KIND,
-                                 TAG_RAY_USER_NODE_TYPE, STATUS_UP_TO_DATE,
-                                 NODE_KIND_WORKER, NODE_KIND_UNMANAGED)
+from ray.autoscaler.tags import (
+    TAG_RAY_LAUNCH_CONFIG, TAG_RAY_RUNTIME_CONFIG,
+    TAG_RAY_FILE_MOUNTS_CONTENTS, TAG_RAY_NODE_STATUS, TAG_RAY_NODE_KIND,
+    TAG_RAY_USER_NODE_TYPE, STATUS_UP_TO_DATE, NODE_KIND_WORKER,
+    NODE_KIND_UNMANAGED, NODE_KIND_HEAD)
 from ray.autoscaler._private.providers import _get_node_provider
 from ray.autoscaler._private.updater import NodeUpdaterThread
 from ray.autoscaler._private.node_launcher import NodeLauncher
 from ray.autoscaler._private.resource_demand_scheduler import \
-    ResourceDemandScheduler, NodeType, NodeID
+    get_bin_pack_residual, ResourceDemandScheduler, NodeType, NodeID, NodeIP, \
+    ResourceDict
 from ray.autoscaler._private.util import ConcurrentCounter, validate_config, \
-    with_head_node_ip, hash_launch_conf, hash_runtime_conf, \
+    with_head_node_ip, hash_launch_conf, hash_runtime_conf, add_prefix, \
    DEBUG_AUTOSCALING_STATUS, DEBUG_AUTOSCALING_ERROR
 from ray.autoscaler._private.constants import \
    AUTOSCALER_MAX_NUM_FAILURES, AUTOSCALER_MAX_LAUNCH_BATCH, \
@@ -47,7 +48,7 @@ class StandardAutoscaler:
    There are two ways to start an autoscaling cluster: manually by running
    `ray start --head --autoscaling-config=/path/to/config.yaml` on a
    instance that has permission to launch other instances, or you can also use
-    `ray create_or_update /path/to/config.yaml` from your laptop, which will
+    `ray up /path/to/config.yaml` from your laptop, which will
    configure the right AWS/Cloud roles automatically.

    StandardAutoscaler's `update` method is periodically called by `monitor.py`
@@ -66,8 +67,11 @@ class StandardAutoscaler:
                 max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
                 max_failures=AUTOSCALER_MAX_NUM_FAILURES,
                 process_runner=subprocess,
-                 update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
+                 update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
+                 prefix_cluster_info=False):
        self.config_path = config_path
+        # Prefix each line of info string with cluster name if True
+        self.prefix_cluster_info = prefix_cluster_info
        # Keep this before self.reset (self.provider needs to be created
        # exactly once).
        self.provider = None
@@ -164,27 +168,35 @@ class StandardAutoscaler:
        last_used = self.load_metrics.last_used_time_by_ip
        horizon = now - (60 * self.config["idle_timeout_minutes"])

-        nodes_to_terminate = []
+        nodes_to_terminate: Dict[NodeID, bool] = []
        node_type_counts = collections.defaultdict(int)
        # Sort based on last used to make sure to keep min_workers that
        # were most recently used. Otherwise, _keep_min_workers_of_node_type
        # might keep a node that should be terminated.
-        for node_id in self._sort_based_on_last_used(nodes, last_used):
+        sorted_node_ids = self._sort_based_on_last_used(nodes, last_used)
+        # Don't terminate nodes needed by request_resources()
+        nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
+        if self.resource_demand_vector:
+            nodes_allowed_to_terminate = self._get_nodes_allowed_to_terminate(
+                sorted_node_ids)
+
+        for node_id in sorted_node_ids:
            # Make sure to not kill idle node types if the number of workers
-            # of that type is lower/equal to the min_workers of that type.
-            if self._keep_min_worker_of_node_type(
-                    node_id,
-                    node_type_counts) and self.launch_config_ok(node_id):
+            # of that type is lower/equal to the min_workers of that type
+            # or it is needed for request_resources().
+            if (self._keep_min_worker_of_node_type(node_id, node_type_counts)
+                    or not nodes_allowed_to_terminate.get(
+                        node_id, True)) and self.launch_config_ok(node_id):
                continue

            node_ip = self.provider.internal_ip(node_id)
            if node_ip in last_used and last_used[node_ip] < horizon:
                logger.info("StandardAutoscaler: "
-                            "{}: Terminating idle node".format(node_id))
+                            "{}: Terminating idle node.".format(node_id))
                nodes_to_terminate.append(node_id)
            elif not self.launch_config_ok(node_id):
                logger.info("StandardAutoscaler: "
-                            "{}: Terminating outdated node".format(node_id))
+                            "{}: Terminating outdated node.".format(node_id))
                nodes_to_terminate.append(node_id)

        if nodes_to_terminate:
@@ -198,7 +210,7 @@ class StandardAutoscaler:
               len(nodes_to_terminate)) > self.config["max_workers"] and nodes:
            to_terminate = nodes.pop()
            logger.info("StandardAutoscaler: "
-                        "{}: Terminating unneeded node".format(to_terminate))
+                        "{}: Terminating unneeded node.".format(to_terminate))
            nodes_to_terminate.append(to_terminate)

        if nodes_to_terminate:
@@ -226,15 +238,23 @@ class StandardAutoscaler:
            if not updater.is_alive():
                completed.append(node_id)
        if completed:
+            nodes_to_terminate: List[NodeID] = []
            for node_id in completed:
                if self.updaters[node_id].exitcode == 0:
                    self.num_successful_updates[node_id] += 1
+                    # Mark the node as active to prevent the node recovery
+                    # logic immediately trying to restart Ray on the new node.
+                    self.load_metrics.mark_active(
+                        self.provider.internal_ip(node_id))
                else:
+                    logger.error(f"StandardAutoscaler: {node_id}: Terminating "
+                                 "failed to setup/initialize node.")
+                    nodes_to_terminate.append(node_id)
                    self.num_failed_updates[node_id] += 1
                del self.updaters[node_id]
-            # Mark the node as active to prevent the node recovery logic
-            # immediately trying to restart Ray on the new node.
-            self.load_metrics.mark_active(self.provider.internal_ip(node_id))
+            if nodes_to_terminate:
+                self.provider.terminate_nodes(nodes_to_terminate)
+
            nodes = self.workers()
            self.log_info_string(nodes)

@@ -266,14 +286,16 @@ class StandardAutoscaler:
                                 last_used: Dict[str, float]) -> List[NodeID]:
        """Sort the nodes based on the last time they were used.

-        The first item in the return list is the least recently used.
+        The first item in the return list is the most recently used.
        """
        updated_last_used = copy.deepcopy(last_used)
-        now = time.time()
+        # Add the unconnected nodes as the least recently used (the end of
+        # list). This prioritizes connected nodes.
+        least_recently_used = -1
        for node_id in nodes:
            node_ip = self.provider.internal_ip(node_id)
            if node_ip not in updated_last_used:
-                updated_last_used[node_ip] = now
+                updated_last_used[node_ip] = least_recently_used

        def last_time_used(node_id: NodeID):
            node_ip = self.provider.internal_ip(node_id)
@@ -281,9 +303,86 @@ class StandardAutoscaler:

        return sorted(nodes, key=last_time_used, reverse=True)

-    def _keep_min_worker_of_node_type(self, node_id: NodeID,
-                                      node_type_counts: Dict[NodeType, int]):
-        """Returns if workers of node_type should be terminated.
+    def _get_nodes_allowed_to_terminate(
+            self, sorted_node_ids: List[NodeID]) -> Dict[NodeID, bool]:
+        # TODO(ameer): try merging this with resource_demand_scheduler
+        # code responsible for adding nodes for request_resources().
+        """Returns the nodes allowed to terminate for request_resources().
+
+        The head node's resources and the resources of every node tagged
+        with a user node type are bin-packed against
+        self.resource_demand_vector. A worker node is marked as allowed to
+        terminate only if its entry in the bin-packing result equals its
+        full (non-empty) resource dict.
+
+        Args:
+            sorted_node_ids: the node ids sorted based on last used (LRU last).
+
+        Returns:
+            nodes_allowed_to_terminate: whether the node id is allowed to
+                terminate or not. Only nodes that carry the
+                TAG_RAY_USER_NODE_TYPE tag get an entry.
+        """
+        nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
+        # Deep copy so the configured node type resources are not shared
+        # with the list handed to the bin-packing helper.
+        head_node_resources: ResourceDict = copy.deepcopy(
+            self.available_node_types[self.config["head_node_type"]][
+                "resources"])
+        if not head_node_resources:
+            # Legacy yaml might include {} in the resources field.
+            # TODO(ameer): this is somewhat duplicated in
+            # resource_demand_scheduler.py.
+            head_id: List[NodeID] = self.provider.non_terminated_nodes({
+                TAG_RAY_NODE_KIND: NODE_KIND_HEAD
+            })
+            if head_id:
+                head_ip = self.provider.internal_ip(head_id[0])
+                static_nodes: Dict[
+                    NodeIP,
+                    ResourceDict] = \
+                    self.load_metrics.get_static_node_resources_by_ip()
+                # NOTE(review): direct indexing raises KeyError if the head
+                # ip is missing from the load metrics; the worker branch
+                # below uses .get(node_ip, {}) instead. Confirm intended.
+                head_node_resources = static_nodes[head_ip]
+            else:
+                head_node_resources = {}
+
+        # Index 0 is the head node; worker entries are appended in the same
+        # order as resource_demand_vector_worker_node_ids.
+        max_node_resources: List[ResourceDict] = [head_node_resources]
+        resource_demand_vector_worker_node_ids = []
+        # Get max resources on all the non terminated nodes.
+        for node_id in sorted_node_ids:
+            tags = self.provider.node_tags(node_id)
+            if TAG_RAY_USER_NODE_TYPE in tags:
+                node_type = tags[TAG_RAY_USER_NODE_TYPE]
+                node_resources: ResourceDict = copy.deepcopy(
+                    self.available_node_types[node_type]["resources"])
+                if not node_resources:
+                    # Legacy yaml might include {} in the resources field.
+                    static_nodes: Dict[
+                        NodeIP,
+                        ResourceDict] = \
+                            self.load_metrics.get_static_node_resources_by_ip()
+                    node_ip = self.provider.internal_ip(node_id)
+                    node_resources = static_nodes.get(node_ip, {})
+                max_node_resources.append(node_resources)
+                resource_demand_vector_worker_node_ids.append(node_id)
+        # Since it is sorted based on last used, we "keep" nodes that are
+        # most recently used when we binpack. We assume get_bin_pack_residual
+        # is following the given order here.
+        # NOTE(review): the second return value is presumably the per-node
+        # resources left over after packing the demands (same order as the
+        # input list) -- confirm against get_bin_pack_residual.
+        used_resource_requests: List[ResourceDict]
+        _, used_resource_requests = \
+            get_bin_pack_residual(max_node_resources,
+                                  self.resource_demand_vector)
+        # Remove the first entry (the head node).
+        max_node_resources.pop(0)
+        # Remove the first entry (the head node).
+        used_resource_requests.pop(0)
+        # After dropping the head entry, index i in both lists lines up with
+        # resource_demand_vector_worker_node_ids[i].
+        for i, node_id in enumerate(resource_demand_vector_worker_node_ids):
+            if used_resource_requests[i] == max_node_resources[i] \
+                    and max_node_resources[i]:
+                # No resources of the node were needed for request_resources().
+                # max_node_resources[i] is an empty dict for legacy yamls
+                # before the node is connected.
+                nodes_allowed_to_terminate[node_id] = True
+            else:
+                nodes_allowed_to_terminate[node_id] = False
+        return nodes_allowed_to_terminate
+
+    def _keep_min_worker_of_node_type(
+            self, node_id: NodeID,
+            node_type_counts: Dict[NodeType, int]) -> bool:
+        """Returns if workers of node_type can be terminated.
+        The worker cannot be terminated to respect min_workers constraint.

        Receives the counters of running nodes so far and determines if idle
        node_id should be terminated or not. It also updates the counters
@@ -293,7 +392,7 @@ class StandardAutoscaler:
            node_type_counts(Dict[NodeType, int]): The non_terminated node
                types counted so far.
        Returns:
-            bool: if workers of node_types should be terminated or not.
+            bool: if workers of node_types can be terminated or not.
        """
        tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in tags:
@@ -589,6 +688,8 @@ class StandardAutoscaler:
            self.load_metrics.get_resource_utilization())
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_STATUS, tmp, overwrite=True)
+        if self.prefix_cluster_info:
+            tmp = add_prefix(tmp, self.config["cluster_name"])
        logger.debug(tmp)

    def info_string(self, nodes):
@@ -29,8 +29,6 @@ from ray.autoscaler._private.subprocess_output_util import (
 from ray.autoscaler._private.cli_logger import cli_logger, cf
 from ray.util.debug import log_once

-from ray.autoscaler._private.constants import RAY_HOME
-
 logger = logging.getLogger(__name__)

 # How long to wait for a node to start, in seconds
@@ -114,6 +112,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
        self.node_id = str(node_id)
        self.namespace = namespace
        self.kubectl = ["kubectl", "-n", self.namespace]
+        self._home_cached = None

    def run(
            self,
@@ -195,7 +194,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
                logger.warning("'rsync_filter' detected but is currently "
                               "unsupported for k8s.")
        if target.startswith("~"):
-            target = RAY_HOME + target[1:]
+            target = self._home + target[1:]

        try:
            flags = "-aqz" if is_rsync_silent() else "-avz"
@@ -211,7 +210,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
                "rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
                UserWarning)
            if target.startswith("~"):
-                target = RAY_HOME + target[1:]
+                target = self._home + target[1:]

            self.process_runner.check_call(self.kubectl + [
                "cp", source, "{}/{}:{}".format(self.namespace, self.node_id,
@@ -219,8 +218,8 @@ class KubernetesCommandRunner(CommandRunnerInterface):
            ])

    def run_rsync_down(self, source, target, options=None):
-        if target.startswith("~"):
-            target = RAY_HOME + target[1:]
+        if source.startswith("~"):
+            source = self._home + source[1:]

        try:
            flags = "-aqz" if is_rsync_silent() else "-avz"
@@ -236,7 +235,7 @@ class KubernetesCommandRunner(CommandRunnerInterface):
                "rsync failed: '{}'. Falling back to 'kubectl cp'".format(e),
                UserWarning)
            if target.startswith("~"):
-                target = RAY_HOME + target[1:]
+                target = self._home + target[1:]

            self.process_runner.check_call(self.kubectl + [
                "cp", "{}/{}:{}".format(self.namespace, self.node_id, source),
@@ -244,8 +243,21 @@ class KubernetesCommandRunner(CommandRunnerInterface):
            ])

    def remote_shell_command_str(self):
-        return "{} exec -it {} bash".format(" ".join(self.kubectl),
-                                            self.node_id)
+        return "{} exec -it {} -- bash".format(" ".join(self.kubectl),
+                                               self.node_id)
+
+    @property
+    def _home(self):
+        """Return the node's HOME directory, fetched once and then cached.
+
+        The value is obtained by running ``printenv HOME`` on the node via
+        ``kubectl exec`` and is stored in ``self._home_cached`` so later
+        accesses do not issue another command.
+        """
+        # TODO (Dmitri): Think about how to use the node's HOME variable
+        # without making an extra kubectl exec call.
+        if self._home_cached is not None:
+            return self._home_cached
+        printenv_cmd = " ".join(
+            self.kubectl +
+            ["exec", "-it", self.node_id, "--", "printenv", "HOME"])
+        output = self.process_runner.check_output(printenv_cmd, shell=True)
+        # Drop the trailing newline / carriage return from the command output.
+        self._home_cached = output.decode().strip("\n\r")
+        return self._home_cached


 class SSHOptions:
@@ -5,6 +5,7 @@ _configured = False
 _core_api = None
 _auth_api = None
 _extensions_beta_api = None
+_custom_objects_api = None


 def _load_config():
@@ -45,4 +46,13 @@ def extensions_beta_api():
    return _extensions_beta_api


+def custom_objects_api():
+    """Return the shared ``kubernetes.client.CustomObjectsApi`` instance.
+
+    The client is created on first use, after calling ``_load_config()``,
+    and kept in the module-level ``_custom_objects_api`` for later calls.
+    """
+    global _custom_objects_api
+    if _custom_objects_api is not None:
+        return _custom_objects_api
+
+    _load_config()
+    _custom_objects_api = kubernetes.client.CustomObjectsApi()
+    return _custom_objects_api
+
+
 log_prefix = "KubernetesNodeProvider: "
@@ -1,4 +1,6 @@
+import copy
 import logging
+import math

 from kubernetes import client
 from kubernetes.client.rest import ApiException
@@ -45,9 +47,10 @@ def not_provided_msg(resource_type):

 def bootstrap_kubernetes(config):
    if not config["provider"]["use_internal_ips"]:
-        return ValueError("Exposing external IP addresses for ray pods isn't "
-                          "currently supported. Please set "
-                          "'use_internal_ips' to false.")
+        return ValueError(
+            "Exposing external IP addresses for ray containers isn't "
+            "currently supported. Please set "
+            "'use_internal_ips' to false.")
    namespace = _configure_namespace(config["provider"])
    _configure_autoscaler_service_account(namespace, config["provider"])
    _configure_autoscaler_role(namespace, config["provider"])
@@ -56,6 +59,62 @@ def bootstrap_kubernetes(config):
    return config


+def fillout_resources_kubernetes(config):
+    """Fill in autodetected CPU/GPU resources for each available node type.
+
+    For every entry of ``config["available_node_types"]``, the resources
+    detected from the first container of the node type's pod spec are merged
+    into that node type's "resources" field, creating the field if missing.
+    Autodetected values overwrite existing "CPU"/"GPU" entries.
+
+    Args:
+        config: The cluster config dict. It is modified in place.
+
+    Returns:
+        The same config object. It is returned untouched when it has no
+        "available_node_types" key.
+    """
+    if "available_node_types" not in config:
+        # Nothing to fill out. Return the config itself: indexing the missing
+        # key here would raise a KeyError.
+        return config
+    node_types = copy.deepcopy(config["available_node_types"])
+    for node_type in node_types:
+        container_data = node_types[node_type]["node_config"]["spec"][
+            "containers"][0]
+        autodetected_resources = get_autodetected_resources(container_data)
+        if "resources" not in config["available_node_types"][node_type]:
+            config["available_node_types"][node_type]["resources"] = {}
+        config["available_node_types"][node_type]["resources"].update(
+            autodetected_resources)
+        logger.debug(
+            "Updating the resources of node type {} to include {}.".format(
+                node_type, autodetected_resources))
+    return config
+
+
+def get_autodetected_resources(container_data):
+    """Derive the "CPU" and "GPU" resource counts from a container spec.
+
+    Args:
+        container_data: Container dict from a pod spec; its optional
+            "resources" entry is inspected.
+
+    Returns:
+        Dict with keys "CPU" and "GPU". Both are 0 when the container has
+        no "resources" entry (or the entry is None).
+    """
+    container_resources = container_data.get("resources", None)
+    if container_resources is None:
+        return {"CPU": 0, "GPU": 0}
+
+    detected = {}
+    for name in ("cpu", "gpu"):
+        detected[name.upper()] = get_resource(container_resources, name)
+    return detected
+
+
+def get_resource(container_resources, resource_name):
+    """Return the smaller of a resource's request and limit as an int.
+
+    A field that does not specify the resource counts as infinite; when
+    neither "requests" nor "limits" specifies it, the result is 0.
+    """
+    amounts = [
+        _get_resource(container_resources, resource_name, field_name=field)
+        for field in ("requests", "limits")
+    ]
+    smallest = min(amounts)
+    if smallest == float("inf"):
+        return 0
+    return int(smallest)
+
+
+def _get_resource(container_resources, resource_name, field_name):
+    """Look up one resource under one field ("requests" or "limits").
+
+    Returns the parsed quantity, or ``float("inf")`` when either the field
+    or the resource within it is absent.
+    """
+    if field_name not in container_resources:
+        return float("inf")
+    if resource_name not in container_resources[field_name]:
+        return float("inf")
+    return _parse_resource(container_resources[field_name][resource_name])
+
+
+def _parse_resource(resource):
+    """Convert a Kubernetes resource quantity to a whole number of units.
+
+    Args:
+        resource: The quantity as written in the container spec, e.g.
+            ``2``, ``"2"``, ``0.5``, ``"1.5"`` or a milli-unit string such
+            as ``"500m"``.
+
+    Returns:
+        int: The quantity, rounded up to the nearest integer.
+
+    Raises:
+        ValueError: If the quantity is neither a plain number nor a
+            milli-unit number.
+    """
+    resource_str = str(resource)
+    if resource_str.endswith("m"):
+        # Milli-units: "500m" is half a unit; round up to a whole unit.
+        return math.ceil(int(resource_str[:-1]) / 1000)
+    try:
+        return int(resource_str)
+    except ValueError:
+        # Fractional quantities such as 0.5 or "1.5" are valid in a pod
+        # spec; round them up the same way milli-units are rounded.
+        return math.ceil(float(resource_str))
+
+
 def _configure_namespace(provider_config):
    namespace_field = "namespace"
    if namespace_field not in provider_config:
@@ -6,7 +6,8 @@ from kubernetes.client.rest import ApiException
 from ray.autoscaler._private.command_runner import KubernetesCommandRunner
 from ray.autoscaler._private.kubernetes import core_api, log_prefix, \
    extensions_beta_api
-from ray.autoscaler._private.kubernetes.config import bootstrap_kubernetes
+from ray.autoscaler._private.kubernetes.config import bootstrap_kubernetes, \
+    fillout_resources_kubernetes
 from ray.autoscaler.node_provider import NodeProvider
 from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME

@@ -177,6 +178,11 @@ class KubernetesNodeProvider(NodeProvider):
    def bootstrap_config(cluster_config):
        return bootstrap_kubernetes(cluster_config)

+    @staticmethod
+    def fillout_available_node_types_resources(cluster_config):
+        """Fills out missing "resources" field for available_node_types.
+
+        Delegates to fillout_resources_kubernetes and returns its result.
+        """
+        return fillout_resources_kubernetes(cluster_config)
+

 def _add_service_name_to_service_port(spec, svc_name):
    """Goes recursively through the ingress manifest and adds the
@@ -82,12 +82,14 @@ class LoadMetrics:
        def prune(mapping):
            unwanted = set(mapping) - active_ips
            for unwanted_key in unwanted:
-                logger.info("LoadMetrics: "
-                            "Removed mapping: {} - {}".format(
-                                unwanted_key, mapping[unwanted_key]))
+                # TODO (Alex): Change this back to info after #12138.
+                logger.debug("LoadMetrics: "
+                             "Removed mapping: {} - {}".format(
+                                 unwanted_key, mapping[unwanted_key]))
                del mapping[unwanted_key]
            if unwanted:
-                logger.info(
+                # TODO (Alex): Change this back to info after #12138.
+                logger.debug(
                    "LoadMetrics: "
                    "Removed {} stale ip mappings: {} not in {}".format(
                        len(unwanted), unwanted, active_ips))
@@ -135,24 +135,6 @@ class ResourceDemandScheduler:
                this set of resources. This differs from resources_demands in
                that we don't take into account existing usage.
        """
-
-        # If the user is using request_resources() API, calculate the remaining
-        # delta resources required to meet their requested cluster size.
-        if ensure_min_cluster_size is not None:
-            used_resources = []
-            for ip, max_res in max_resources_by_ip.items():
-                res = copy.deepcopy(max_res)
-                _inplace_subtract(res, unused_resources_by_ip.get(ip, {}))
-                used_resources.append(res)
-            # Example: user requests 1000 CPUs, but the cluster is currently
-            # 500 CPUs in size with 250 used. Then, the delta is 750 CPUs that
-            # we need to fit to get the cluster to scale to 1000.
-            resource_requests, _ = get_bin_pack_residual(
-                used_resources, ensure_min_cluster_size)
-            resource_demands += resource_requests
-        else:
-            resource_requests = []
-
        if self.is_legacy_yaml():
            # When using legacy yaml files we need to infer the head & worker
            # node resources from the static node resources from LoadMetrics.
@@ -166,9 +148,12 @@ class ResourceDemandScheduler:
        logger.info("Cluster resources: {}".format(node_resources))
        logger.info("Node counts: {}".format(node_type_counts))
        # Step 2: add nodes to add to satisfy min_workers for each type
-        node_resources, node_type_counts, min_workers_nodes_to_add = \
+        (node_resources,
+         node_type_counts,
+         adjusted_min_workers) = \
            _add_min_workers_nodes(
-                node_resources, node_type_counts, self.node_types)
+                node_resources, node_type_counts, self.node_types,
+                self.max_workers, ensure_min_cluster_size)

        # Step 3: add nodes for strict spread groups
        logger.info(f"Placement group demands: {pending_placement_groups}")
@@ -180,8 +165,16 @@ class ResourceDemandScheduler:
                not self.node_types[NODE_TYPE_LEGACY_WORKER]["resources"]:
            # Need to launch worker nodes to later infer their
            # resources.
+            # We add request_resources() demands here to make sure we launch
+            # a single worker sometimes even if min_workers = 0 and resource
+            # demands is empty.
+            if ensure_min_cluster_size:
+                request_resources_demands = ensure_min_cluster_size
+            else:
+                request_resources_demands = []
            return self._legacy_worker_node_to_launch(
-                nodes, launching_nodes, node_resources, resource_demands)
+                nodes, launching_nodes, node_resources,
+                resource_demands + request_resources_demands)
        placement_group_nodes_to_add, node_resources, node_type_counts = \
            self.reserve_and_allocate_spread(
                strict_spreads, node_resources, node_type_counts)
@@ -194,20 +187,15 @@ class ResourceDemandScheduler:
        logger.info("Unfulfilled demands: {}".format(unfulfilled))
        # Add 1 to account for the head node.
        max_to_add = self.max_workers + 1 - sum(node_type_counts.values())
-        if resource_requests:
-            nodes_to_add_based_on_requests = get_nodes_for(
-                self.node_types, node_type_counts, max_to_add,
-                resource_requests)
-        else:
-            nodes_to_add_based_on_requests = {}
        nodes_to_add_based_on_demand = get_nodes_for(
            self.node_types, node_type_counts, max_to_add, unfulfilled)
        # Merge nodes to add based on demand and nodes to add based on
        # min_workers constraint. We add them because nodes to add based on
        # demand was calculated after the min_workers constraint was respected.
        total_nodes_to_add = {}
+
        for node_type in self.node_types:
-            nodes_to_add = (min_workers_nodes_to_add.get(
+            nodes_to_add = (adjusted_min_workers.get(
                node_type, 0) + placement_group_nodes_to_add.get(node_type, 0)
                            + nodes_to_add_based_on_demand.get(node_type, 0))
            if nodes_to_add > 0:
@@ -216,7 +204,7 @@ class ResourceDemandScheduler:
        # Limit the number of concurrent launches
        total_nodes_to_add = self._get_concurrent_resource_demand_to_launch(
            total_nodes_to_add, unused_resources_by_ip.keys(), nodes,
-            launching_nodes, nodes_to_add_based_on_requests)
+            launching_nodes, adjusted_min_workers)

        logger.info("Node requests: {}".format(total_nodes_to_add))
        return total_nodes_to_add
@@ -294,7 +282,7 @@ class ResourceDemandScheduler:
            connected_nodes: List[NodeIP],
            non_terminated_nodes: List[NodeID],
            pending_launches_nodes: Dict[NodeType, int],
-            nodes_to_add_based_on_requests: Dict[NodeType, int],
+            adjusted_min_workers: Dict[NodeType, int],
    ) -> Dict[NodeType, int]:
        """Updates the max concurrent resources to launch for each node type.

@@ -314,9 +302,10 @@ class ResourceDemandScheduler:
            connected_nodes: Running nodes (from LoadMetrics).
            non_terminated_nodes: Non terminated nodes (pending/running).
            pending_launches_nodes: Nodes that are in the launch queue.
-            nodes_to_add_based_on_requests: Nodes to launch to satisfy
-                request_resources(). This overrides the launch limits since the
-                user is hinting to immediately scale up to this size.
+            adjusted_min_workers: Nodes to launch to satisfy
+                min_workers and request_resources(). This overrides the launch
+                limits since the user is hinting to immediately scale up to
+                this size.
        Returns:
            Dict[NodeType, int]: Maximum number of nodes to launch for each
                node type.
@@ -338,13 +327,9 @@ class ResourceDemandScheduler:
            upper_bound = max(
                max_allowed_pending_nodes - total_pending_nodes,

-                # Allow more nodes if this is to respect min_workers.
-                self.node_types[node_type].get("min_workers", 0) -
-                total_pending_nodes - running_nodes[node_type],
-
-                # Allow more nodes from request_resources API.
-                nodes_to_add_based_on_requests.get(node_type,
-                                                   0) - total_pending_nodes)
+                # Allow more nodes if this is to respect min_workers or
+                # request_resources().
+                adjusted_min_workers.get(node_type, 0))

            if upper_bound > 0:
                updated_nodes_to_launch[node_type] = min(
@@ -504,21 +489,26 @@ def _node_type_counts_to_node_resources(
 def _add_min_workers_nodes(
        node_resources: List[ResourceDict],
        node_type_counts: Dict[NodeType, int],
-        node_types: Dict[NodeType, NodeTypeConfigDict],
+        node_types: Dict[NodeType, NodeTypeConfigDict], max_workers: int,
+        ensure_min_cluster_size: List[ResourceDict]
 ) -> (List[ResourceDict], Dict[NodeType, int], Dict[NodeType, int]):
-    """Updates resource demands to respect the min_workers constraint.
+    """Updates resource demands to respect the min_workers and
+    request_resources() constraints.

    Args:
        node_resources: Resources of exisiting nodes already launched/pending.
        node_type_counts: Counts of existing nodes already launched/pending.
        node_types: Node types config.
+        max_workers: global max_workers constraint.
+        ensure_min_cluster_size: resource demands from request_resources().

    Returns:
        node_resources: The updated node resources after adding min_workers
-            constraint per node type.
+            and request_resources() constraints per node type.
        node_type_counts: The updated node counts after adding min_workers
-            constraint per node type.
-        total_nodes_to_add: The nodes to add to respect min_workers constraint.
+            and request_resources() constraints per node type.
+        total_nodes_to_add_dict: The nodes to add to respect min_workers and
+            request_resources() constraints.
    """
    total_nodes_to_add_dict = {}
    for node_type, config in node_types.items():
@@ -528,10 +518,41 @@ def _add_min_workers_nodes(
        if existing < target:
            total_nodes_to_add_dict[node_type] = target - existing
            node_type_counts[node_type] = target
-            available = copy.deepcopy(node_types[node_type]["resources"])
-            node_resources.extend(
-                [available] * total_nodes_to_add_dict[node_type])
+            node_resources.extend([
+                copy.deepcopy(node_types[node_type]["resources"])
+                for _ in range(total_nodes_to_add_dict[node_type])
+            ])

+    if ensure_min_cluster_size:
+        max_to_add = max_workers + 1 - sum(node_type_counts.values())
+        max_node_resources = []
+        # Fit request_resources() on all the resources as if they are idle.
+        for node_type in node_type_counts:
+            max_node_resources.extend([
+                copy.deepcopy(node_types[node_type]["resources"])
+                for _ in range(node_type_counts[node_type])
+            ])
+        # Get the unfulfilled to ensure min cluster size.
+        resource_requests_unfulfilled, _ = get_bin_pack_residual(
+            max_node_resources, ensure_min_cluster_size)
+        # Get the nodes to meet the unfulfilled.
+        nodes_to_add_request_resources = get_nodes_for(
+            node_types, node_type_counts, max_to_add,
+            resource_requests_unfulfilled)
+        # Update the resources, counts and total nodes to add.
+        for node_type in nodes_to_add_request_resources:
+            nodes_to_add = nodes_to_add_request_resources.get(node_type, 0)
+            if nodes_to_add > 0:
+                node_type_counts[
+                    node_type] = nodes_to_add + node_type_counts.get(
+                        node_type, 0)
+                node_resources.extend([
+                    copy.deepcopy(node_types[node_type]["resources"])
+                    for _ in range(nodes_to_add)
+                ])
+                total_nodes_to_add_dict[
+                    node_type] = nodes_to_add + total_nodes_to_add_dict.get(
+                        node_type, 0)
    return node_resources, node_type_counts, total_nodes_to_add_dict


@@ -623,7 +644,8 @@ def _utilization_score(node_resources: ResourceDict,

 def get_bin_pack_residual(node_resources: List[ResourceDict],
                          resource_demands: List[ResourceDict],
-                          strict_spread: bool = False) -> List[ResourceDict]:
+                          strict_spread: bool = False
+                          ) -> (List[ResourceDict], List[ResourceDict]):
    """Return a subset of resource_demands that cannot fit in the cluster.

    TODO(ekl): this currently does not guarantee the resources will be packed
@@ -638,7 +660,7 @@ def get_bin_pack_residual(node_resources: List[ResourceDict],
            placed on a different entry in `node_resources`.

    Returns:
-        List[ResourceDict] the residual list resources that do not fit.
+        List[ResourceDict]: the residual list resources that do not fit.
        List[ResourceDict]: The updated node_resources after the method.
    """

@@ -256,8 +256,16 @@ class NodeUpdater:

                        retry_str = "(" + str(e) + ")"
                        if hasattr(e, "cmd"):
+                            if isinstance(e.cmd, str):
+                                cmd_ = e.cmd
+                            elif isinstance(e.cmd, list):
+                                cmd_ = " ".join(e.cmd)
+                            else:
+                                logger.debug(f"e.cmd type ({type(e.cmd)}) not "
+                                             "list or str.")
+                                cmd_ = str(e.cmd)
                            retry_str = "(Exit Status {}): {}".format(
-                                e.returncode, " ".join(e.cmd))
+                                e.returncode, cmd_)

                        cli_logger.print(
                            "SSH still not available {}, "
@@ -244,3 +244,14 @@ def hash_runtime_conf(file_mounts,
        file_mounts_contents_hash = None

    return (_hash_cache[conf_str], file_mounts_contents_hash)
+
+
+def add_prefix(info_string, prefix):
+    """Return info_string with every line after the first prefixed.
+
+    Each such line becomes "<prefix>:<line>"; the first line is unchanged.
+    """
+    first_line, *remaining = info_string.split("\n")
+    prefixed = [":".join((prefix, line)) for line in remaining]
+    return "\n".join([first_line] + prefixed)
@@ -112,7 +112,7 @@ setup_commands:
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
@@ -27,10 +28,10 @@ docker:
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:latest-gpu"
+    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

-    # worker_image: "rayproject/ray:latest-cpu"
+    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

 # If a node is idle for this many minutes, it will be removed.
@@ -128,7 +129,7 @@ setup_commands: []
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
-    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands: []
@@ -19,13 +19,14 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:latest-gpu"
+    image: "rayproject/ray-ml:latest-gpu"
+    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_nvidia_docker" # e.g. ray_docker

    # # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:latest-gpu"
+    # head_image: "rayproject/ray-ml:latest-gpu"

-    # worker_image: "rayproject/ray:latest"
+    # worker_image: "rayproject/ray-ml:latest"

 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
@@ -90,8 +91,8 @@ file_mounts: {
 # List of shell commands to run to set up nodes.
 # NOTE: rayproject/ray:latest has ray latest bundled
 setup_commands: []
-    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
-    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -2,7 +2,7 @@
 cluster_name: java
 # The minimum number of workers nodes to launch in addition to the head
 # node. This number should be >= 0.
-min_workers: 1  
+min_workers: 1
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers.
 max_workers: 1
@@ -72,10 +72,10 @@ worker_setup_commands: []
 # Command to start ray on the head node. You don't need to change this.
 head_start_ray_commands:
    - ray stop
-    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --code-search-path=~/ray-word-count/target
+    - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
 # Command to start ray on worker nodes. You don't need to change this.
 worker_start_ray_commands:
    - ray stop
-    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --code-search-path=ray-word-count/target
+    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

-# To run the program, run `ray exec java.yaml "java -jar ray-word-count/target/ray-word-count-1.0-SNAPSHOT-jar-with-dependencies.jar"`
+# To run the program, run `ray exec java.yaml "java -Dray.job.code-search-path=ray-word-count/target -jar ray-word-count/target/ray-word-count-1.0-SNAPSHOT-jar-with-dependencies.jar"`
@@ -24,7 +24,7 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "" # e.g., rayproject/ray:latest
+    image: "" # e.g., rayproject/ray-ml:latest
    container_name: "" # e.g. ray_docker
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
@@ -32,9 +32,9 @@ docker:
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:latest-gpu"
+    # head_image: "rayproject/ray-ml:latest-gpu"

-    # worker_image: "rayproject/ray:latest"
+    # worker_image: "rayproject/ray-ml:latest"

 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
@@ -120,7 +120,7 @@ setup_commands:
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    - source activate pytorch_p36 && pip install -U ray
-    - source activate pytorch_p36 && pip install -U ray[rllib] ray[tune] ray[debug]
+    - source activate pytorch_p36 && pip install -U ray[rllib] ray[tune] ray
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
@@ -112,7 +112,7 @@ setup_commands:
    - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
    # - echo 'conda activate py37_pytorch' >> ~/.bashrc
    - echo 'conda activate py37_tensorflow' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
@@ -27,10 +28,10 @@ docker:
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:latest-gpu"
+    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

-    # worker_image: "rayproject/ray:latest-cpu"
+    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

 # If a node is idle for this many minutes, it will be removed.
@@ -128,7 +129,7 @@ setup_commands:
    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
    - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
    - echo 'conda activate py37_tensorflow' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -19,13 +19,14 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:latest-gpu"
+    image: "rayproject/ray-ml:latest-gpu"
+    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_nvidia_docker" # e.g. ray_docker

    # # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:latest-gpu"
+    # head_image: "rayproject/ray-ml:latest-gpu"

-    # worker_image: "rayproject/ray:latest"
+    # worker_image: "rayproject/ray-ml:latest"

 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
@@ -65,9 +66,9 @@ file_mounts: {
 }

 # List of shell commands to run to set up nodes.
-# NOTE: rayproject/ray:latest has ray latest bundled
+# NOTE: rayproject/ray-ml:latest has ray latest bundled
 setup_commands: []
-#     - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+#     - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
 
 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:latest-gpu"
+    image: "rayproject/ray-ml:latest-gpu"
+    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_docker"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
@@ -27,9 +28,9 @@ docker:
    run_options: []  # Extra options to pass into "docker run"

    # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:latest-gpu"
+    # head_image: "rayproject/ray-ml:latest-gpu"

-    # worker_image: "rayproject/ray:latest"
+    # worker_image: "rayproject/ray-ml:latest"

 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
@@ -97,7 +98,7 @@ setup_commands:
    - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
    # - echo 'conda activate py37_pytorch' >> ~/.bashrc
    - echo 'conda activate py37_tensorflow' >> ~/.bashrc
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
@@ -130,7 +130,7 @@ setup_commands:
      && echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile

    # Install ray
-    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl


 # Custom commands that will be run on the head node after common setup.
@@ -19,7 +19,8 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-  image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+  image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+  # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
  container_name: "ray_container"
  # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
  # if no cached version is present.
@@ -27,10 +28,10 @@ docker:
  run_options: []  # Extra options to pass into "docker run"

  # Example of running a GPU head with CPU workers
-  # head_image: "rayproject/ray:latest-gpu"
+  # head_image: "rayproject/ray-ml:latest-gpu"
  # Allow Ray to automatically detect GPUs

-  # worker_image: "rayproject/ray:latest-cpu"
+  # worker_image: "rayproject/ray-ml:latest-cpu"
  # worker_run_options: []

 # If a node is idle for this many minutes, it will be removed.
@@ -136,7 +137,7 @@ setup_commands: []
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
-    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl


 # Custom commands that will be run on the head node after common setup.
@@ -19,14 +19,15 @@ upscaling_speed: 1.0
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
 docker:
-    image: "rayproject/ray:latest-gpu"
+    image: "rayproject/ray-ml:latest-gpu"
+    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_nvidia_docker" # e.g. ray_docker

    # # Example of running a GPU head with CPU workers
-    # head_image: "rayproject/ray:latest-gpu"
+    # head_image: "rayproject/ray-ml:latest-gpu"


-    # worker_image: "rayproject/ray:latest"
+    # worker_image: "rayproject/ray-ml:latest"

 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
@@ -117,10 +118,10 @@ initialization_commands:
          done"

 # List of shell commands to run to set up nodes.
-# NOTE: rayproject/ray:latest has ray latest bundled
+# NOTE: rayproject/ray-ml:latest has ray latest bundled
 setup_commands: []
-    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
-    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -142,7 +142,7 @@ head_node:
          #   - rsync (used for `ray rsync` commands and file mounts)
          #   - screen (used for `ray attach`)
          #   - kubectl (used by the autoscaler to manage worker pods)
-          image: rayproject/ray
+          image: rayproject/ray:nightly
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
@@ -215,7 +215,7 @@ worker_nodes:
          # You are free (and encouraged) to use your own container image,
          # but it should have the following installed:
          #   - rsync (used for `ray rsync` commands and file mounts)
-          image: rayproject/ray
+          image: rayproject/ray:nightly
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
@@ -142,7 +142,7 @@ head_node:
          #   - rsync (used for `ray rsync` commands and file mounts)
          #   - screen (used for `ray attach`)
          #   - kubectl (used by the autoscaler to manage worker pods)
-          image: rayproject/ray
+          image: rayproject/ray:nightly
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
@@ -215,7 +215,7 @@ worker_nodes:
          # You are free (and encouraged) to use your own container image,
          # but it should have the following installed:
          #   - rsync (used for `ray rsync` commands and file mounts)
-          image: rayproject/ray
+          image: rayproject/ray:nightly
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
@@ -146,7 +146,7 @@ head_node:
              #   - rsync (used for `ray rsync` commands and file mounts)
              #   - screen (used for `ray attach`)
              #   - kubectl (used by the autoscaler to manage worker pods)
-              image: rayproject/ray
+              image: rayproject/ray:nightly
              # Do not change this command - it keeps the pod alive until it is
              # explicitly killed.
              command: ["/bin/bash", "-c", "--"]
@@ -221,7 +221,7 @@ worker_nodes:
              # You are free (and encouraged) to use your own container image,
              # but it should have the following installed:
              #   - rsync (used for `ray rsync` commands and file mounts)
-              image: rayproject/ray
+              image: rayproject/ray:nightly
              # Do not change this command - it keeps the pod alive until it is
              # explicitly killed.
              command: ["/bin/bash", "-c", "--"]
@@ -0,0 +1,128 @@
+apiVersion: cluster.ray.io/v1
+kind: RayCluster
+metadata:
+  name: example-cluster
+spec:
+  # The maximum number of worker nodes to launch in addition to the head node.
+  maxWorkers: 3
+  # The autoscaler will scale up the cluster faster with higher upscaling speed.
+  # E.g., if the task requires adding more nodes then autoscaler will gradually
+  # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
+  # This number should be > 0.
+  upscalingSpeed: 1.0
+  # If a node is idle for this many minutes, it will be removed.
+  idleTimeoutMinutes: 5
+  # Specify the pod type for the ray head node (as configured below).
+  headPodType: head-node
+  # Specify the default pod type for the ray worker nodes (as configured below).
+  workerDefaultPodType: worker-nodes
+  # Specify the allowed pod types for this ray cluster and the resources they provide.
+  podTypes:
+  - name: head-node
+    podConfig:
+      apiVersion: v1
+      kind: Pod
+      metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: example-cluster-ray-head-
+      spec:
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          image: rayproject/ray:nightly
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ['trap : TERM INT; sleep infinity & wait;']
+          ports:
+          - containerPort: 6379 # Redis port.
+          - containerPort: 12345 # Ray internal communication.
+          - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          resources:
+            requests:
+              cpu: 1000m
+              memory: 512Mi
+            limits:
+              # The maximum memory that this pod is allowed to use. The
+              # limit will be detected by ray and split to use 10% for
+              # redis, 30% for the shared memory object store, and the
+              # rest for application memory. If this limit is not set and
+              # the object store size is not set manually, ray will
+              # allocate a very large object store in each pod that may
+              # cause problems for other pods.
+              memory: 512Mi
+  - name: worker-nodes
+    # Minimum number of Ray workers of this Pod type.
+    minWorkers: 2
+    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
+    maxWorkers: 3
+    # User-specified custom resources for use by Ray 
+    rayResources: {"Custom1": 1, "is_spot": 1}
+    # Optional commands to run before starting the Ray runtime.
+    setupCommands: 
+      - pip install numpy # Example
+    podConfig:
+      apiVersion: v1
+      kind: Pod
+      metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: example-cluster-ray-worker-
+      spec:
+        restartPolicy: Never
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          image: rayproject/ray:nightly
+          command: ["/bin/bash", "-c", "--"]
+          args: ["trap : TERM INT; sleep infinity & wait;"]
+          ports:
+          - containerPort: 12345 # Ray internal communication.
+          - containerPort: 12346 # Ray internal communication.
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          resources:
+            requests:
+              cpu: 1000m
+              memory: 512Mi
+            limits:
+              # The maximum memory that this pod is allowed to use. The
+              # limit will be detected by ray and split to use 10% for
+              # redis, 30% for the shared memory object store, and the
+              # rest for application memory. If this limit is not set and
+              # the object store size is not set manually, ray will
+              # allocate a very large object store in each pod that may
+              # cause problems for other pods.
+              memory: 512Mi
+  # Commands to start Ray on the head node. You don't need to change this.
+  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
+  headStartRayCommands:
+      - ray stop
+      - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076  --dashboard-host 0.0.0.0
+  # Commands to start Ray on worker nodes. You don't need to change this.
+  workerStartRayCommands:
+      - ray stop
+      - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,128 @@
+apiVersion: cluster.ray.io/v1
+kind: RayCluster
+metadata:
+  name: example-cluster2
+spec:
+  # The maximum number of worker nodes to launch in addition to the head node.
+  maxWorkers: 3
+  # The autoscaler will scale up the cluster faster with higher upscaling speed.
+  # E.g., if the task requires adding more nodes then autoscaler will gradually
+  # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
+  # This number should be > 0.
+  upscalingSpeed: 1.0
+  # If a node is idle for this many minutes, it will be removed.
+  idleTimeoutMinutes: 5
+  # Specify the pod type for the ray head node (as configured below).
+  headPodType: head-node
+  # Specify the default pod type for the ray worker nodes (as configured below).
+  workerDefaultPodType: worker-nodes
+  # Specify the allowed pod types for this ray cluster and the resources they provide.
+  podTypes:
+  - name: head-node
+    podConfig:
+      apiVersion: v1
+      kind: Pod
+      metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: example-cluster2-ray-head-
+      spec:
+        restartPolicy: Never
+
+        # This volume allocates shared memory for Ray to use for its plasma
+        # object store. If you do not provide this, Ray will fall back to
+        # /tmp, which causes slowdowns if it is not a shared memory volume.
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          image: rayproject/ray:nightly
+          # Do not change this command - it keeps the pod alive until it is
+          # explicitly killed.
+          command: ["/bin/bash", "-c", "--"]
+          args: ['trap : TERM INT; sleep infinity & wait;']
+          ports:
+          - containerPort: 6379 # Redis port.
+          - containerPort: 12345 # Ray internal communication.
+          - containerPort: 12346 # Ray internal communication.
+
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          resources:
+            requests:
+              cpu: 1000m
+              memory: 512Mi
+            limits:
+              # The maximum memory that this pod is allowed to use. The
+              # limit will be detected by ray and split to use 10% for
+              # redis, 30% for the shared memory object store, and the
+              # rest for application memory. If this limit is not set and
+              # the object store size is not set manually, ray will
+              # allocate a very large object store in each pod that may
+              # cause problems for other pods.
+              memory: 512Mi
+  - name: worker-nodes
+    # Minimum number of Ray workers of this Pod type.
+    minWorkers: 1
+    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
+    maxWorkers: 3
+    # User-specified custom resources for use by Ray 
+    rayResources: {"Custom1": 1, "is_spot": 1}
+    # Optional commands to run before starting the Ray runtime.
+    setupCommands: 
+      - pip install numpy # Example
+    podConfig:
+      apiVersion: v1
+      kind: Pod
+      metadata:
+        # Automatically generates a name for the pod with this prefix.
+        generateName: example-cluster2-ray-worker-
+      spec:
+        restartPolicy: Never
+        volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        containers:
+        - name: ray-node
+          imagePullPolicy: Always
+          image: rayproject/ray:nightly
+          command: ["/bin/bash", "-c", "--"]
+          args: ["trap : TERM INT; sleep infinity & wait;"]
+          ports:
+          - containerPort: 12345 # Ray internal communication.
+          - containerPort: 12346 # Ray internal communication.
+          # This volume allocates shared memory for Ray to use for its plasma
+          # object store. If you do not provide this, Ray will fall back to
+          # /tmp, which causes slowdowns if it is not a shared memory volume.
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          resources:
+            requests:
+              cpu: 1000m
+              memory: 512Mi
+            limits:
+              # The maximum memory that this pod is allowed to use. The
+              # limit will be detected by ray and split to use 10% for
+              # redis, 30% for the shared memory object store, and the
+              # rest for application memory. If this limit is not set and
+              # the object store size is not set manually, ray will
+              # allocate a very large object store in each pod that may
+              # cause problems for other pods.
+              memory: 512Mi
+  # Commands to start Ray on the head node. You don't need to change this.
+  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
+  headStartRayCommands:
+      - ray stop
+      - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076  --dashboard-host 0.0.0.0
+  # Commands to start Ray on worker nodes. You don't need to change this.
+  workerStartRayCommands:
+      - ray stop
+      - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -9,8 +9,8 @@ apiVersion: rbac.authorization.k8s.io/v1
 metadata:
  name: ray-operator-role
 rules:
- apiGroups: ["", "rbac.authorization.k8s.io"]
-  resources: ["configmaps", "pods", "pods/exec", "services", "serviceaccounts", "roles", "rolebindings"]
+- apiGroups: ["", "cluster.ray.io"]
+  resources: ["rayclusters", "pods", "pods/exec"]
  verbs: ["get", "watch", "list", "create", "delete", "patch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
@@ -35,8 +35,7 @@ spec:
  - name: ray
    imagePullPolicy: Always
    image: rayproject/ray:nightly
-    command: ["/bin/bash", "-c", "--"]
-    args: ["ray-operator; trap : TERM INT; sleep infinity & wait;"]
+    command: ["ray-operator"]
    env:
    - name: RAY_OPERATOR_POD_NAMESPACE
      valueFrom:
@@ -1,260 +0,0 @@
-# An unique identifier for the head node and workers of this cluster.
-cluster_name: default
-
-# The autoscaler will scale up the cluster to this target fraction of resource
-# usage. For example, if a cluster of 10 nodes is 100% busy and
-# target_utilization is 0.8, it would resize the cluster to 13. This fraction
-# can be decreased to increase the aggressiveness of upscaling.
-# This value must be less than 1.0 for scaling to happen.
-target_utilization_fraction: 0.8
-
-# If a node is idle for this many minutes, it will be removed.
-idle_timeout_minutes: 5
-
-# Kubernetes resources that need to be configured for the autoscaler to be
-# able to manage the Ray cluster. If any of the provided resources don't
-# exist, the autoscaler will attempt to create them. If this fails, you may
-# not have the required permissions and will have to request them to be
-# created by your cluster administrator.
-provider:
-    type: kubernetes
-
-    # Exposing external IP addresses for ray pods isn't currently supported.
-    use_internal_ips: true
-
-    # Namespace to use for all resources created.
-    namespace: ray
-
-    services:
-      # Service that maps to the head node of the Ray cluster.
-      - apiVersion: v1
-        kind: Service
-        metadata:
-            # NOTE: If you're running multiple Ray clusters with services
-            # on one Kubernetes cluster, they must have unique service
-            # names.
-            name: ray-head
-        spec:
-            # This selector must match the head node pod's selector below.
-            selector:
-                component: ray-head
-            ports:
-                - protocol: TCP
-                  port: 8000
-                  targetPort: 8000
-
-      # Service that maps to the worker nodes of the Ray cluster.
-      - apiVersion: v1
-        kind: Service
-        metadata:
-            # NOTE: If you're running multiple Ray clusters with services
-            # on one Kubernetes cluster, they must have unique service
-            # names.
-            name: ray-workers
-        spec:
-            # This selector must match the worker node pods' selector below.
-            selector:
-                component: ray-worker
-            ports:
-                - protocol: TCP
-                  port: 8000
-                  targetPort: 8000
-
-# Kubernetes pod config for the head node pod.
-available_node_types:
-  head_node:
-      resources: {}
-      node_config:
-        apiVersion: v1
-        kind: Pod
-        metadata:
-            # Automatically generates a name for the pod with this prefix.
-            generateName: ray-head-
-
-            # Must match the head node service selector above if a head node
-            # service is required.
-            labels:
-                component: ray-head
-        spec:
-            # Restarting the head node automatically is not currently supported.
-            # If the head node goes down, `ray up` must be run again.
-            restartPolicy: Never
-
-            # This volume allocates shared memory for Ray to use for its plasma
-            # object store. If you do not provide this, Ray will fall back to
-            # /tmp which cause slowdowns if is not a shared memory volume.
-            volumes:
-            - name: dshm
-              emptyDir:
-                  medium: Memory
-
-            containers:
-            - name: ray-node
-              imagePullPolicy: Always
-              # You are free (and encouraged) to use your own container image,
-              # but it should have the following installed:
-              #   - rsync (used for `ray rsync` commands and file mounts)
-              #   - screen (used for `ray attach`)
-              #   - kubectl (used by the autoscaler to manage worker pods)
-              image: rayproject/ray:nightly
-              # Do not change this command - it keeps the pod alive until it is
-              # explicitly killed.
-              command: ["/bin/bash", "-c", "--"]
-              args: ["trap : TERM INT; sleep infinity & wait;"]
-              ports:
-                  - containerPort: 6379 # Redis port.
-                  - containerPort: 6380 # Redis port.
-                  - containerPort: 6381 # Redis port.
-                  - containerPort: 12345 # Ray internal communication.
-                  - containerPort: 12346 # Ray internal communication.
-
-              # This volume allocates shared memory for Ray to use for its plasma
-              # object store. If you do not provide this, Ray will fall back to
-              # /tmp which cause slowdowns if is not a shared memory volume.
-              volumeMounts:
-                  - mountPath: /dev/shm
-                    name: dshm
-              resources:
-                  requests:
-                      cpu: 1000m
-                      memory: 512Mi
-                  limits:
-                      # The maximum memory that this pod is allowed to use. The
-                      # limit will be detected by ray and split to use 10% for
-                      # redis, 30% for the shared memory object store, and the
-                      # rest for application memory. If this limit is not set and
-                      # the object store size is not set manually, ray will
-                      # allocate a very large object store in each pod that may
-                      # cause problems for other pods.
-                      memory: 2Gi
-              env:
-                  # This is used in the head_start_ray_commands below so that
-                  # Ray can spawn the correct number of processes. Omitting this
-                  # may lead to degraded performance.
-                  - name: MY_CPU_REQUEST
-                    valueFrom:
-                        resourceFieldRef:
-                            resource: requests.cpu
-
-  worker_nodes:
-      resources: {}
-      min_workers: 1
-      max_workers: 2
-      node_config:
-        apiVersion: v1
-        kind: Pod
-        metadata:
-            # Automatically generates a name for the pod with this prefix.
-            generateName: ray-worker-
-
-            # Must match the worker node service selector above if a worker node
-            # service is required.
-            labels:
-                component: ray-worker
-        spec:
-            serviceAccountName: default
-
-            # Worker nodes will be managed automatically by the head node, so
-            # do not change the restart policy.
-            restartPolicy: Never
-
-            # This volume allocates shared memory for Ray to use for its plasma
-            # object store. If you do not provide this, Ray will fall back to
-            # /tmp which cause slowdowns if is not a shared memory volume.
-            volumes:
-            - name: dshm
-              emptyDir:
-                  medium: Memory
-
-            containers:
-            - name: ray-node
-              imagePullPolicy: Always
-              # You are free (and encouraged) to use your own container image,
-              # but it should have the following installed:
-              #   - rsync (used for `ray rsync` commands and file mounts)
-              image: rayproject/ray:nightly
-              # Do not change this command - it keeps the pod alive until it is
-              # explicitly killed.
-              command: ["/bin/bash", "-c", "--"]
-              args: ["trap : TERM INT; sleep infinity & wait;"]
-              ports:
-                  - containerPort: 12345 # Ray internal communication.
-                  - containerPort: 12346 # Ray internal communication.
-
-              # This volume allocates shared memory for Ray to use for its plasma
-              # object store. If you do not provide this, Ray will fall back to
-              # /tmp which cause slowdowns if is not a shared memory volume.
-              volumeMounts:
-                  - mountPath: /dev/shm
-                    name: dshm
-              resources:
-                  requests:
-                      cpu: 100m
-                      memory: 512Mi
-                  limits:
-                      # This memory limit will be detected by ray and split into
-                      # 30% for plasma, and 70% for workers.
-                      memory: 2Gi
-              env:
-                  # This is used in the head_start_ray_commands below so that
-                  # Ray can spawn the correct number of processes. Omitting this
-                  # may lead to degraded performance.
-                  - name: MY_CPU_REQUEST
-                    valueFrom:
-                        resourceFieldRef:
-                            resource: requests.cpu
-
-head_node_type:
-  head_node
-
-worker_default_node_type:
-  worker_nodes
-# Files or directories to copy to the head and worker nodes. The format is a
-# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
-file_mounts: {
-}
-
-# Files or directories to copy from the head node to the worker nodes. The format is a
-# list of paths. The same path on the head node will be copied to the worker node.
-# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
-# you should just use file_mounts. Only use this if you know what you're doing!
-cluster_synced_files: []
-
-# Whether changes to directories in file_mounts or cluster_synced_files in the head node
-# should sync to the worker node continuously
-file_mounts_sync_continuously: False
-
-# Patterns for files to exclude when running rsync up or rsync down.
-# This is not supported on kubernetes.
-rsync_exclude: []
-
-# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
-# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
-# as a value, the behavior will match git's behavior for finding and using .gitignore files.
-# This is not supported on kubernetes.
-rsync_filter: []
-
-# List of commands that will be run before `setup_commands`. If docker is
-# enabled, these commands will run outside the container and before docker
-# is setup.
-initialization_commands: []
-
-# List of shell commands to run to set up nodes.
-setup_commands: []
-
-# Custom commands that will be run on the head node after common setup.
-head_setup_commands: []
-
-# Custom commands that will be run on worker nodes after common setup.
-worker_setup_commands: []
-
-# Command to start ray on the head node. You don't need to change this.
-# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward.
-head_start_ray_commands:
-    - ray stop
-    - ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --object-manager-port=8076 --dashboard-host 0.0.0.0
-
-# Command to start ray on worker nodes. You don't need to change this.
-worker_start_ray_commands:
-    - ray stop
-    - ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -25,7 +25,8 @@ idle_timeout_minutes: 5
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled. Assumes Docker is installed.
 docker:
-    image: "rayproject/ray:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
+    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
@@ -93,7 +94,7 @@ setup_commands: []
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
-    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands: []
@@ -20,7 +20,7 @@
    "additionalProperties": false,
 	"properties": {
        "cluster_name": {
-            "description": "An unique identifier for the head node and workers of this cluster.",
+            "description": "A unique identifier for the head node and workers of this cluster.",
            "type": "string"
        },
        "min_workers": {
@@ -3,9 +3,8 @@ from traceback import format_exception

 import colorama

-import ray
 import ray.cloudpickle as pickle
-from ray.core.generated.common_pb2 import RayException, Language
+from ray.core.generated.common_pb2 import RayException, Language, PYTHON
 import setproctitle


@@ -17,7 +16,7 @@ class RayError(Exception):
        exc_info = (type(self), self, self.__traceback__)
        formatted_exception_string = "\n".join(format_exception(*exc_info))
        return RayException(
-            language=ray.Language.PYTHON.value(),
+            language=PYTHON,
            serialized_exception=pickle.dumps(self),
            formatted_exception_string=formatted_exception_string
        ).SerializeToString()
@@ -26,7 +25,7 @@ class RayError(Exception):
    def from_bytes(b):
        ray_exception = RayException()
        ray_exception.ParseFromString(b)
-        if ray_exception.language == ray.Language.PYTHON.value():
+        if ray_exception.language == PYTHON:
            return pickle.loads(ray_exception.serialized_exception)
        else:
            return CrossLanguageError(ray_exception)
@@ -81,6 +80,7 @@ class RayTaskError(RayError):
                 pid=None,
                 ip=None):
        """Initialize a RayTaskError."""
+        import ray
        if proctitle:
            self.proctitle = proctitle
        else:
@@ -7,34 +7,88 @@ import logging

 logger = logging.getLogger(__name__)

-# _client_api has to be external to the API stub, below.
-# Otherwise, ray.remote() that contains ray.remote()
-# contains a reference to the RayAPIStub, therefore a
-# reference to the _client_api, and then tries to pickle
-# the thing.
+# About these global variables: Ray 1.0 uses exported module functions to
+# provide its API, and we need to match that. However, we want different
+# behaviors depending on where, exactly, in the client stack this is running.
+#
+# The reason for these differences depends on what's being pickled and passed
+# to functions, or functions inside functions. So there are three cases to care
+# about
+#
+# (Python Client)-->(Python ClientServer)-->(Internal Raylet Process)
+#
+# * _client_api should be set if we're inside the client
+# * _server_api should be set if we're inside the clientserver
+# * Both will be set if we're running both (as in a test)
+# * Neither should be set if we're inside the raylet (but we still need to shim
+#       from the client API surface to the Ray API)
+#
+# RayAPIStub (below) delegates to the appropriate one of these
+# depending on what's set or not. Then, all users importing the ray object
+# from this package get the stub which routes them to the appropriate APIImpl.
 _client_api: Optional[APIImpl] = None
+_server_api: Optional[APIImpl] = None
+
+# The reason for _is_server is a hack around the above comment while running
+# tests. If we have both a client and a server trying to control these static
+# variables then we need a way to decide which to use. In this case, both
+# _client_api and _server_api are set.
+# This boolean flips between the two
+_is_server: bool = False


@contextmanager
 def stash_api_for_tests(in_test: bool):
-    api = None
+    """Force server-side API routing for the duration of a test block.
+
+    When ``in_test`` is true, ``_is_server`` is set to True while the
+    ``with`` body runs and its previous value is put back afterwards.
+    The context always yields the current ``_server_api``.
+    """
+    global _is_server
+    is_server = _is_server
     if in_test:
-        api = stash_api()
-    yield api
-    if in_test:
-        restore_api(api)
+        _is_server = True
+    try:
+        yield _server_api
+    finally:
+        # Restore the flag even if the body raised; otherwise one failing
+        # test would leave every later lookup routed to the server API.
+        if in_test:
+            _is_server = is_server


-def stash_api() -> Optional[APIImpl]:
+def _set_client_api(val: Optional[APIImpl]):
    global _client_api
-    a = _client_api
+    global _is_server
+    if _client_api is not None:
+        raise Exception("Trying to set more than one client API")
+    _client_api = val
+    _is_server = False
+
+
+def _set_server_api(val: Optional[APIImpl]):
+    global _server_api
+    global _is_server
+    if _server_api is not None:
+        raise Exception("Trying to set more than one server API")
+    _server_api = val
+    _is_server = True
+
+
+def reset_api():
+    global _client_api
+    global _server_api
+    global _is_server
    _client_api = None
-    return a
+    _server_api = None
+    _is_server = False


-def restore_api(api: Optional[APIImpl]):
+def _get_client_api() -> APIImpl:
    global _client_api
-    _client_api = api
+    global _server_api
+    global _is_server
+    api = None
+    if _is_server:
+        api = _server_api
+    else:
+        api = _client_api
+    if api is None:
+        # We're inside a raylet worker
+        from ray.experimental.client.server.core_ray_api import CoreRayAPI
+        return CoreRayAPI()
+    return api


 class RayAPIStub:
@@ -43,11 +97,10 @@ class RayAPIStub:
                secure: bool = False,
                metadata: List[Tuple[str, str]] = None,
                stub=None):
-        global _client_api
        from ray.experimental.client.worker import Worker
        _client_worker = Worker(
            conn_str, secure=secure, metadata=metadata, stub=stub)
-        _client_api = ClientAPI(_client_worker)
+        _set_client_api(ClientAPI(_client_worker))

    def disconnect(self):
        global _client_api
@@ -56,15 +109,9 @@ class RayAPIStub:
        _client_api = None

    def __getattr__(self, key: str):
-        global _client_api
-        self.__check_client_api()
-        return getattr(_client_api, key)
-
-    def __check_client_api(self):
-        global _client_api
-        if _client_api is None:
-            from ray.experimental.client.server.core_ray_api import CoreRayAPI
-            _client_api = CoreRayAPI()
+        global _get_client_api
+        api = _get_client_api()
+        return getattr(api, key)


 ray = RayAPIStub()
@@ -11,40 +11,145 @@

 from abc import ABC
 from abc import abstractmethod
+from typing import TYPE_CHECKING, Any, Union, Optional
+import ray.core.generated.ray_client_pb2 as ray_client_pb2
+if TYPE_CHECKING:
+    from ray.experimental.client.common import ClientActorHandle
+    from ray.experimental.client.common import ClientStub
+    from ray.experimental.client.common import ClientObjectRef
+    from ray._raylet import ObjectRef
+
+    # Use the imports for type checking.  This is a python 3.6 limitation.
+    # See https://www.python.org/dev/peps/pep-0563/
+    PutType = Union[ClientObjectRef, ObjectRef]


 class APIImpl(ABC):
+    """
+    APIImpl is the interface to implement for whichever version of the core
+    Ray API that needs abstracting when run in client mode.
+    """
+
    @abstractmethod
-    def get(self, *args, **kwargs):
+    def get(self, vals, *, timeout: Optional[float] = None) -> Any:
+        """
+        get is the hook stub passed on to replace `ray.get`
+
+        Args:
+            vals: [Client]ObjectRef or list of these refs to retrieve.
+            timeout: Optional timeout in seconds
+        """
        pass

    @abstractmethod
-    def put(self, *args, **kwargs):
+    def put(self, vals: Any, *args,
+            **kwargs) -> Union["ClientObjectRef", "ObjectRef"]:
+        """
+        put is the hook stub passed on to replace `ray.put`
+
+        Args:
+            vals: The value or list of values to `put`.
+            args: opaque arguments
+            kwargs: opaque keyword arguments
+        """
        pass

    @abstractmethod
    def wait(self, *args, **kwargs):
+        """
+        wait is the hook stub passed on to replace `ray.wait`
+
+        Args:
+            args: opaque arguments
+            kwargs: opaque keyword arguments
+        """
        pass

    @abstractmethod
    def remote(self, *args, **kwargs):
+        """
+        remote is the hook stub passed on to replace `ray.remote`.
+
+        This sets up remote functions or actors, as the decorator,
+        but does not execute them.
+
+        Args:
+            args: opaque arguments
+            kwargs: opaque keyword arguments
+        """
        pass

    @abstractmethod
-    def call_remote(self, f, kind, *args, **kwargs):
+    def call_remote(self, instance: "ClientStub", *args, **kwargs):
+        """
+        call_remote is called by stub objects to execute them remotely.
+
+        This is used by stub objects in situations where they're called
+        with .remote, eg, `f.remote()` or `actor_cls.remote()`.
+        This allows the client stub objects to delegate execution to be
+        implemented in the most effective way whether it's in the client,
+        clientserver, or raylet worker.
+
+        Args:
+            instance: The Client-side stub reference to a remote object
+            args: opaque arguments
+            kwargs: opaque keyword arguments
+        """
        pass

    @abstractmethod
-    def close(self, *args, **kwargs):
+    def close(self) -> None:
+        """
+        close cleans up an API connection by closing any channels or
+        shutting down any servers gracefully.
+        """
+        pass
+
+    @abstractmethod
+    def kill(self, actor, *, no_restart=True):
+        """
+        kill forcibly stops an actor running in the cluster
+
+        Args:
+            no_restart: Whether this actor should be restarted if it's a
+              restartable actor.
+        """
+        pass
+
+    @abstractmethod
+    def cancel(self, obj, *, force=False, recursive=True):
+        """
+        Cancels a task on the cluster.
+
+        If the specified task is pending execution, it will not be executed. If
+        the task is currently executing, the behavior depends on the ``force``
+        flag, as per `ray.cancel()`
+
+        Only non-actor tasks can be canceled. Canceled tasks will not be
+        retried (max_retries will not be respected).
+
+        Args:
+            obj (ObjectRef): ObjectRef returned by the task
+                that should be canceled.
+            force (boolean): Whether to force-kill a running task by killing
+                the worker that is running the task.
+            recursive (boolean): Whether to try to cancel tasks submitted by
+                the task specified.
+        """
        pass


 class ClientAPI(APIImpl):
+    """
+    The Client-side methods corresponding to the ray API. Delegates
+    to the Client Worker that contains the connection to the ClientServer.
+    """
+
    def __init__(self, worker):
        self.worker = worker

-    def get(self, *args, **kwargs):
-        return self.worker.get(*args, **kwargs)
+    def get(self, vals, *, timeout=None):
+        return self.worker.get(vals, timeout=timeout)

    def put(self, *args, **kwargs):
        return self.worker.put(*args, **kwargs)
@@ -55,12 +160,65 @@ class ClientAPI(APIImpl):
    def remote(self, *args, **kwargs):
        return self.worker.remote(*args, **kwargs)

-    def call_remote(self, f, kind, *args, **kwargs):
-        return self.worker.call_remote(f, kind, *args, **kwargs)
+    def call_remote(self, instance: "ClientStub", *args, **kwargs):
+        return self.worker.call_remote(instance, *args, **kwargs)

-    def close(self, *args, **kwargs):
+    def close(self) -> None:
        return self.worker.close()

+    def kill(self, actor: "ClientActorHandle", *, no_restart=True):
+        return self.worker.terminate_actor(actor, no_restart)
+
+    def cancel(self, obj: "ClientObjectRef", *, force=False, recursive=True):
+        return self.worker.terminate_task(obj, force, recursive)
+
+    # Various metadata methods for the client that are defined in the protocol.
+    def is_initialized(self) -> bool:
+        """ True if our client is connected, and if the server is initialized.
+
+        Returns:
+            A boolean determining if the client is connected and
+            server initialized.
+        """
+        return self.worker.is_initialized()
+
+    def nodes(self):
+        """Get a list of the nodes in the cluster (for debugging only).
+
+        Returns:
+            Information about the Ray clients in the cluster.
+        """
+        return self.worker.get_cluster_info(
+            ray_client_pb2.ClusterInfoType.NODES)
+
+    def cluster_resources(self):
+        """Get the current total cluster resources.
+
+        Note that this information can grow stale as nodes are added to or
+        removed from the cluster.
+
+        Returns:
+            A dictionary mapping resource name to the total quantity of that
+                resource in the cluster.
+        """
+        return self.worker.get_cluster_info(
+            ray_client_pb2.ClusterInfoType.CLUSTER_RESOURCES)
+
+    def available_resources(self):
+        """Get the current available cluster resources.
+
+        This is different from `cluster_resources` in that this will return
+        idle (available) resources rather than total resources.
+
+        Note that this information can grow stale as tasks start and finish.
+
+        Returns:
+            A dictionary mapping resource name to the quantity of that
+                resource currently available in the cluster.
+        """
+        return self.worker.get_cluster_info(
+            ray_client_pb2.ClusterInfoType.AVAILABLE_RESOURCES)
+
    def __getattr__(self, key: str):
        if not key.startswith("_"):
            raise NotImplementedError(
@@ -1,12 +1,16 @@
 import ray.core.generated.ray_client_pb2 as ray_client_pb2
 from ray.experimental.client import ray
 from typing import Any
+from typing import Dict
 from ray import cloudpickle

+import base64
+

 class ClientBaseRef:
-    def __init__(self, id):
+    def __init__(self, id, handle=None):
        self.id = id
+        self.handle = handle

    def __repr__(self):
        return "%s(%s)" % (
@@ -17,83 +21,243 @@ class ClientBaseRef:
    def __eq__(self, other):
        return self.id == other.id

+    def binary(self):
+        return self.id
+
+    @classmethod
+    def from_remote_ref(cls, ref: ray_client_pb2.RemoteRef):
+        return cls(id=ref.id, handle=ref.handle)
+

 class ClientObjectRef(ClientBaseRef):
-    pass
+    def _unpack_ref(self):
+        return cloudpickle.loads(self.handle)


 class ClientActorRef(ClientBaseRef):
    pass


-class ClientRemoteFunc:
+class ClientStub:
+    pass
+
+
+class ClientRemoteFunc(ClientStub):
+    """
+    A stub created on the Ray Client to represent a remote
+    function that can be executed on the cluster.
+
+    This class is allowed to be passed around between remote functions.
+
+    Args:
+        _func: The actual function to execute remotely
+        _name: The original name of the function
+        _ref: The ClientObjectRef of the pickled code of the function, _func
+        _raylet_remote: The Raylet-side ray.remote_function.RemoteFunction
+            for this object
+    """
+
    def __init__(self, f):
        self._func = f
        self._name = f.__name__
        self.id = None
-        self._raylet_remote_func = None
+
+        # self._ref can be lazily instantiated. Rather than eagerly creating
+        # function data objects in the server we can put them just before we
+        # execute the function, especially in cases where many @ray.remote
+        # functions exist in a library and only a handful are ever executed by
+        # a user of the library.
+        #
+        # TODO(barakmich): This ref might actually be better as a serialized
+        # ObjectRef. This requires being able to serialize the ref without
+        # pinning it (as the lifetime of the ref is tied with the server, not
+        # the client)
+        self._ref = None
+        self._raylet_remote = None

    def __call__(self, *args, **kwargs):
        raise TypeError(f"Remote function cannot be called directly. "
                        "Use {self._name}.remote method instead")

    def remote(self, *args, **kwargs):
-        return ray.call_remote(self, ray_client_pb2.ClientTask.FUNCTION, *args,
-                               **kwargs)
+        return ray.call_remote(self, *args, **kwargs)
+
+    def _get_ray_remote_impl(self):
+        if self._raylet_remote is None:
+            self._raylet_remote = ray.remote(self._func)
+        return self._raylet_remote

    def __repr__(self):
-        return "ClientRemoteFunc(%s, %s)" % (self._name, self.id)
+        return "ClientRemoteFunc(%s, %s)" % (self._name, self._ref)
+
+    def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
+        if self._ref is None:
+            self._ref = ray.put(self._func)
+        task = ray_client_pb2.ClientTask()
+        task.type = ray_client_pb2.ClientTask.FUNCTION
+        task.name = self._name
+        task.payload_id = self._ref.handle
+        return task


-class ClientActorClass:
+class ClientActorClass(ClientStub):
+    """ A stub created on the Ray Client to represent an actor class.
+
+    It is wrapped by ray.remote and can be executed on the cluster.
+
+    Args:
+        actor_cls: The actual class to execute remotely
+        _name: The original name of the class
+        _ref: The ClientObjectRef of the pickled `actor_cls`
+        _raylet_remote: The Raylet-side ray.ActorClass for this object
+    """
+
    def __init__(self, actor_cls):
        self.actor_cls = actor_cls
        self._name = actor_cls.__name__
+        self._ref = None
+        self._raylet_remote = None

    def __call__(self, *args, **kwargs):
        raise TypeError(f"Remote actor cannot be instantiated directly. "
                        "Use {self._name}.remote() instead")

+    def __getstate__(self) -> Dict:
+        state = {
+            "actor_cls": self.actor_cls,
+            "_name": self._name,
+            "_ref": self._ref,
+        }
+        return state
+
+    def __setstate__(self, state: Dict) -> None:
+        self.actor_cls = state["actor_cls"]
+        self._name = state["_name"]
+        self._ref = state["_ref"]
+
    def remote(self, *args, **kwargs):
        # Actually instantiate the actor
-        ref = ray.call_remote(self, ray_client_pb2.ClientTask.ACTOR, *args,
-                              **kwargs)
-        return ClientActorHandle(ref, self)
+        ref = ray.call_remote(self, *args, **kwargs)
+        return ClientActorHandle(ClientActorRef(ref.id, ref.handle), self)

    def __repr__(self):
-        return "ClientRemoteActor(%s, %s)" % (self._name, self.id)
+        return "ClientRemoteActor(%s, %s)" % (self._name, self._ref)

    def __getattr__(self, key):
+        if key not in self.__dict__:
+            raise AttributeError("Not a class attribute")
        raise NotImplementedError("static methods")

+    def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
+        if self._ref is None:
+            self._ref = ray.put(self.actor_cls)
+        task = ray_client_pb2.ClientTask()
+        task.type = ray_client_pb2.ClientTask.ACTOR
+        task.name = self._name
+        task.payload_id = self._ref.handle
+        return task

-class ClientActorHandle:
-    def __init__(self, actor_id: ClientActorRef,
+
+class ClientActorHandle(ClientStub):
+    """Client-side stub for instantiated actor.
+
+    A stub created on the Ray Client to represent a remote actor that
+    has been started on the cluster.  This class is allowed to be passed
+    around between remote functions.
+
+    Args:
+        actor_ref: A reference to the running actor given to the client. This
+          is a serialized version of the actual handle as an opaque token.
+        actor_class: A reference to the ClientActorClass that this actor was
+          instantiated from.
+        _real_actor_handle: Cached copy of the Raylet-side
+          ray.actor.ActorHandle contained in actor_ref.
+    """
+
+    def __init__(self, actor_ref: ClientActorRef,
                 actor_class: ClientActorClass):
-        self.actor_id = actor_id
+        self.actor_ref = actor_ref
        self.actor_class = actor_class
+        self._real_actor_handle = None
+
+    def _get_ray_remote_impl(self):
+        if self._real_actor_handle is None:
+            self._real_actor_handle = cloudpickle.loads(self.actor_ref.handle)
+        return self._real_actor_handle
+
+    def __getstate__(self) -> Dict:
+        state = {
+            "actor_ref": self.actor_ref,
+            "actor_class": self.actor_class,
+            "_real_actor_handle": self._real_actor_handle,
+        }
+        return state
+
+    def __setstate__(self, state: Dict) -> None:
+        self.actor_ref = state["actor_ref"]
+        self.actor_class = state["actor_class"]
+        self._real_actor_handle = state["_real_actor_handle"]
+
+    @property
+    def _actor_id(self):
+        return self.actor_ref.id

    def __getattr__(self, key):
        return ClientRemoteMethod(self, key)

+    def __repr__(self):
+        return "ClientActorHandle(%s)" % (self.actor_ref.id.hex())
+
+
+class ClientRemoteMethod(ClientStub):
+    """A stub for a method on a remote actor.
+
+    Can be annotated with execution options.
+
+    Args:
+        actor_handle: A reference to the ClientActorHandle that generated
+          this method and will have this method called upon it.
+        method_name: The name of this method
+    """

-class ClientRemoteMethod:
    def __init__(self, actor_handle: ClientActorHandle, method_name: str):
        self.actor_handle = actor_handle
        self.method_name = method_name
-        self._name = "%s.%s" % (self.actor_handle.actor_class._name,
-                                self.method_name)

    def __call__(self, *args, **kwargs):
        raise TypeError(f"Remote method cannot be called directly. "
                        "Use {self._name}.remote() instead")

+    def _get_ray_remote_impl(self):
+        return getattr(self.actor_handle._get_ray_remote_impl(),
+                       self.method_name)
+
+    def __getstate__(self) -> Dict:
+        state = {
+            "actor_handle": self.actor_handle,
+            "method_name": self.method_name,
+        }
+        return state
+
+    def __setstate__(self, state: Dict) -> None:
+        self.actor_handle = state["actor_handle"]
+        self.method_name = state["method_name"]
+
    def remote(self, *args, **kwargs):
-        return ray.call_remote(self, ray_client_pb2.ClientTask.METHOD, *args,
-                               **kwargs)
+        return ray.call_remote(self, *args, **kwargs)

    def __repr__(self):
-        return "ClientRemoteMethod(%s, %s)" % (self._name, self.actor_id)
+        name = "%s.%s" % (self.actor_handle.actor_class._name,
+                          self.method_name)
+        return "ClientRemoteMethod(%s, %s)" % (name,
+                                               self.actor_handle.actor_id)
+
+    def _prepare_client_task(self) -> ray_client_pb2.ClientTask:
+        task = ray_client_pb2.ClientTask()
+        task.type = ray_client_pb2.ClientTask.METHOD
+        task.name = self.method_name
+        task.payload_id = self.actor_handle.actor_ref.handle
+        return task


 def convert_from_arg(pb) -> Any:
@@ -114,3 +278,13 @@ def convert_to_arg(val):
        out.local = ray_client_pb2.Arg.Locality.INTERNED
        out.data = cloudpickle.dumps(val)
    return out
+
+
+def encode_exception(exception) -> str:
+    data = cloudpickle.dumps(exception)
+    return base64.standard_b64encode(data).decode()
+
+
+def decode_exception(data) -> Exception:
+    data = base64.standard_b64decode(data)
+    return cloudpickle.loads(data)
@@ -7,18 +7,36 @@
 # While the stub is trivial, it allows us to check that the calls we're
 # making into the core-ray module are contained and well-defined.

+from typing import Any
+from typing import Optional
+from typing import Union
+
 import ray

 from ray.experimental.client.api import APIImpl
-from ray.experimental.client.common import ClientRemoteFunc
+from ray.experimental.client.common import ClientObjectRef
+from ray.experimental.client.common import ClientStub


 class CoreRayAPI(APIImpl):
-    def get(self, *args, **kwargs):
-        return ray.get(*args, **kwargs)
+    """
+    Implements the equivalent client-side Ray API by simply passing along to
+    the Core Ray API. Primarily used inside of Ray Workers as a trampoline back
+    to core ray when passed client stubs.
+    """

-    def put(self, *args, **kwargs):
-        return ray.put(*args, **kwargs)
+    def get(self, vals, *, timeout: Optional[float] = None) -> Any:
+        """Resolve refs through core ``ray.get``.
+
+        A ClientObjectRef, or a list whose first element is one, is
+        unpacked with ``_unpack_ref()`` before the call; anything else is
+        handed to ``ray.get`` unchanged.
+
+        Args:
+            vals: A ref or a list of refs to retrieve.
+            timeout: Forwarded as-is to ``ray.get``.
+        """
+        if isinstance(vals, list):
+            # An empty list has no vals[0]; let it fall through to ray.get
+            # instead of raising IndexError.
+            if vals and isinstance(vals[0], ClientObjectRef):
+                return ray.get(
+                    [val._unpack_ref() for val in vals], timeout=timeout)
+        elif isinstance(vals, ClientObjectRef):
+            return ray.get(vals._unpack_ref(), timeout=timeout)
+        return ray.get(vals, timeout=timeout)
+
+    def put(self, vals: Any, *args,
+            **kwargs) -> Union[ClientObjectRef, ray._raylet.ObjectRef]:
+        return ray.put(vals, *args, **kwargs)

    def wait(self, *args, **kwargs):
        return ray.wait(*args, **kwargs)
@@ -26,16 +44,58 @@ class CoreRayAPI(APIImpl):
    def remote(self, *args, **kwargs):
        return ray.remote(*args, **kwargs)

-    def call_remote(self, f: ClientRemoteFunc, kind: int, *args, **kwargs):
-        if f._raylet_remote_func is None:
-            f._raylet_remote_func = ray.remote(f._func)
-        return f._raylet_remote_func.remote(*args, **kwargs)
+    def call_remote(self, instance: ClientStub, *args, **kwargs):
+        return instance._get_ray_remote_impl().remote(*args, **kwargs)

-    def close(self, *args, **kwargs):
+    def close(self) -> None:
        return None

+    def kill(self, actor, *, no_restart=True):
+        return ray.kill(actor, no_restart=no_restart)
+
+    def cancel(self, obj, *, force=False, recursive=True):
+        return ray.cancel(obj, force=force, recursive=recursive)
+
+    def is_initialized(self) -> bool:
+        return ray.is_initialized()
+
    # Allow for generic fallback to ray.* in remote methods. This allows calls
    # like ray.nodes() to be run in remote functions even though the client
    # doesn't currently support them.
    def __getattr__(self, key: str):
        return getattr(ray, key)
+
+
+class RayServerAPI(CoreRayAPI):
+    """
+    Ray Client server-side API shim. By default, simply calls the default Core
+    Ray API calls, but also accepts scheduling calls from functions running
+    inside of other remote functions that need to create more work.
+    """
+
+    def __init__(self, server_instance):
+        self.server = server_instance
+
+    # Wrap single item into list if needed before calling server put.
+    def put(self, vals: Any, *args, **kwargs) -> ClientObjectRef:
+        to_put = []
+        single = False
+        if isinstance(vals, list):
+            to_put = vals
+        else:
+            single = True
+            to_put.append(vals)
+
+        out = [self._put(x) for x in to_put]
+        if single:
+            out = out[0]
+        return out
+
+    def _put(self, val: Any):
+        """Put one value via the server and wrap the result for client code.
+
+        The server's ``_put_and_retain_obj`` returns the core object ref.
+        The client-side ref needs both its binary id and a pickled handle,
+        mirroring what the PutObject RPC sends back, because stubs read
+        ``.handle`` when preparing tasks and when unpacking the ref.
+        """
+        # Local import keeps this block self-contained within the module.
+        from ray import cloudpickle
+        resp = self.server._put_and_retain_obj(val)
+        return ClientObjectRef(resp.binary(), cloudpickle.dumps(resp))
+
+    def call_remote(self, instance: ClientStub, *args, **kwargs):
+        task = instance._prepare_client_task()
+        ticket = self.server.Schedule(task, prepared_args=args)
+        return ClientObjectRef(ticket.return_id)
@@ -3,14 +3,17 @@ from concurrent import futures
 import grpc
 from ray import cloudpickle
 import ray
+import ray.state
 import ray.core.generated.ray_client_pb2 as ray_client_pb2
 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
 import time
 import inspect
-from ray.experimental.client import stash_api_for_tests
+import json
+from ray.experimental.client import stash_api_for_tests, _set_server_api
 from ray.experimental.client.common import convert_from_arg
+from ray.experimental.client.common import encode_exception
 from ray.experimental.client.common import ClientObjectRef
-from ray.experimental.client.common import ClientRemoteFunc
+from ray.experimental.client.server.core_ray_api import RayServerAPI

 logger = logging.getLogger(__name__)

@@ -23,31 +26,98 @@ class RayletServicer(ray_client_pb2_grpc.RayletDriverServicer):
        self.registered_actor_classes = {}
        self._test_mode = test_mode

+    def ClusterInfo(self, request,
+                    context=None) -> ray_client_pb2.ClusterInfoResponse:
+        resp = ray_client_pb2.ClusterInfoResponse()
+        resp.type = request.type
+        if request.type == ray_client_pb2.ClusterInfoType.CLUSTER_RESOURCES:
+            resources = ray.cluster_resources()
+            # Normalize resources into floats
+            # (the function may return values that are ints)
+            float_resources = {k: float(v) for k, v in resources.items()}
+            resp.resource_table.CopyFrom(
+                ray_client_pb2.ClusterInfoResponse.ResourceTable(
+                    table=float_resources))
+        elif request.type == \
+                ray_client_pb2.ClusterInfoType.AVAILABLE_RESOURCES:
+            resources = ray.available_resources()
+            # Normalize resources into floats
+            # (the function may return values that are ints)
+            float_resources = {k: float(v) for k, v in resources.items()}
+            resp.resource_table.CopyFrom(
+                ray_client_pb2.ClusterInfoResponse.ResourceTable(
+                    table=float_resources))
+        else:
+            resp.json = self._return_debug_cluster_info(request, context)
+        return resp
+
+    def _return_debug_cluster_info(self, request, context=None) -> str:
+        data = None
+        if request.type == ray_client_pb2.ClusterInfoType.NODES:
+            data = ray.nodes()
+        elif request.type == ray_client_pb2.ClusterInfoType.IS_INITIALIZED:
+            data = ray.is_initialized()
+        else:
+            raise TypeError("Unsupported cluster info type")
+        return json.dumps(data)
+
+    def Terminate(self, request, context=None):
+        if request.WhichOneof("terminate_type") == "task_object":
+            try:
+                object_ref = cloudpickle.loads(request.task_object.handle)
+                ray.cancel(
+                    object_ref,
+                    force=request.task_object.force,
+                    recursive=request.task_object.recursive)
+            except Exception as e:
+                return_exception_in_context(e, context)
+        elif request.WhichOneof("terminate_type") == "actor":
+            try:
+                actor_ref = cloudpickle.loads(request.actor.handle)
+                ray.kill(actor_ref, no_restart=request.actor.no_restart)
+            except Exception as e:
+                return_exception_in_context(e, context)
+        else:
+            raise RuntimeError(
+                "Client requested termination without providing a valid "
+                "terminate_type")
+        return ray_client_pb2.TerminateResponse(ok=True)
+
    def GetObject(self, request, context=None):
-        if request.id not in self.object_refs:
+        request_ref = cloudpickle.loads(request.handle)
+        if request_ref.binary() not in self.object_refs:
            return ray_client_pb2.GetResponse(valid=False)
-        objectref = self.object_refs[request.id]
+        objectref = self.object_refs[request_ref.binary()]
        logger.info("get: %s" % objectref)
-        item = ray.get(objectref)
+        try:
+            item = ray.get(objectref, timeout=request.timeout)
+        except Exception as e:
+            return_exception_in_context(e, context)
        item_ser = cloudpickle.dumps(item)
        return ray_client_pb2.GetResponse(valid=True, data=item_ser)

-    def PutObject(self, request, context=None):
+    def PutObject(self, request, context=None) -> ray_client_pb2.PutResponse:
        obj = cloudpickle.loads(request.data)
+        objectref = self._put_and_retain_obj(obj)
+        pickled_ref = cloudpickle.dumps(objectref)
+        return ray_client_pb2.PutResponse(
+            ref=make_remote_ref(objectref.binary(), pickled_ref))
+
+    def _put_and_retain_obj(self, obj) -> ray.ObjectRef:
        objectref = ray.put(obj)
        self.object_refs[objectref.binary()] = objectref
        logger.info("put: %s" % objectref)
-        return ray_client_pb2.PutResponse(id=objectref.binary())
+        return objectref

    def WaitObject(self, request, context=None) -> ray_client_pb2.WaitResponse:
-        object_refs = [cloudpickle.loads(o) for o in request.object_refs]
+        object_refs = [cloudpickle.loads(o) for o in request.object_handles]
        num_returns = request.num_returns
        timeout = request.timeout
        object_refs_ids = []
        for object_ref in object_refs:
-            if object_ref.id not in self.object_refs:
+            if object_ref.binary() not in self.object_refs:
                return ray_client_pb2.WaitResponse(valid=False)
-            object_refs_ids.append(self.object_refs[object_ref.id])
+            object_refs_ids.append(self.object_refs[object_ref.binary()])
        try:
            ready_object_refs, remaining_object_refs = ray.wait(
                object_refs_ids,
@@ -59,94 +129,133 @@ class RayletServicer(ray_client_pb2_grpc.RayletDriverServicer):
        logger.info("wait: %s %s" % (str(ready_object_refs),
                                     str(remaining_object_refs)))
        ready_object_ids = [
-            ready_object_ref.binary() for ready_object_ref in ready_object_refs
+            make_remote_ref(
+                id=ready_object_ref.binary(),
+                handle=cloudpickle.dumps(ready_object_ref),
+            ) for ready_object_ref in ready_object_refs
        ]
        remaining_object_ids = [
-            remaining_object_ref.binary()
-            for remaining_object_ref in remaining_object_refs
+            make_remote_ref(
+                id=remaining_object_ref.binary(),
+                handle=cloudpickle.dumps(remaining_object_ref),
+            ) for remaining_object_ref in remaining_object_refs
        ]
        return ray_client_pb2.WaitResponse(
            valid=True,
            ready_object_ids=ready_object_ids,
            remaining_object_ids=remaining_object_ids)

-    def Schedule(self, task, context=None) -> ray_client_pb2.ClientTaskTicket:
+    def Schedule(self, task, context=None,
+                 prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
        logger.info("schedule: %s %s" %
                    (task.name,
                     ray_client_pb2.ClientTask.RemoteExecType.Name(task.type)))
        if task.type == ray_client_pb2.ClientTask.FUNCTION:
-            return self._schedule_function(task, context)
+            return self._schedule_function(task, context, prepared_args)
        elif task.type == ray_client_pb2.ClientTask.ACTOR:
-            return self._schedule_actor(task, context)
+            return self._schedule_actor(task, context, prepared_args)
        elif task.type == ray_client_pb2.ClientTask.METHOD:
-            return self._schedule_method(task, context)
+            return self._schedule_method(task, context, prepared_args)
        else:
            raise NotImplementedError(
                "Unimplemented Schedule task type: %s" %
                ray_client_pb2.ClientTask.RemoteExecType.Name(task.type))

-    def _schedule_method(self, task: ray_client_pb2.ClientTask,
-                         context=None) -> ray_client_pb2.ClientTaskTicket:
+    def _schedule_method(
+            self,
+            task: ray_client_pb2.ClientTask,
+            context=None,
+            prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
        actor_handle = self.actor_refs.get(task.payload_id)
        if actor_handle is None:
            raise Exception(
                "Can't run an actor the server doesn't have a handle for")
-        arglist = _convert_args(task.args)
+        arglist = _convert_args(task.args, prepared_args)
        with stash_api_for_tests(self._test_mode):
            output = getattr(actor_handle, task.name).remote(*arglist)
            self.object_refs[output.binary()] = output
-        return ray_client_pb2.ClientTaskTicket(return_id=output.binary())
+            pickled_ref = cloudpickle.dumps(output)
+        return ray_client_pb2.ClientTaskTicket(
+            return_ref=make_remote_ref(output.binary(), pickled_ref))

-    def _schedule_actor(self, task: ray_client_pb2.ClientTask,
-                        context=None) -> ray_client_pb2.ClientTaskTicket:
+    def _schedule_actor(self,
+                        task: ray_client_pb2.ClientTask,
+                        context=None,
+                        prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
        with stash_api_for_tests(self._test_mode):
-            if task.payload_id not in self.registered_actor_classes:
-                actor_class_ref = self.object_refs[task.payload_id]
+            payload_ref = cloudpickle.loads(task.payload_id)
+            if payload_ref.binary() not in self.registered_actor_classes:
+                actor_class_ref = self.object_refs[payload_ref.binary()]
                actor_class = ray.get(actor_class_ref)
                if not inspect.isclass(actor_class):
                    raise Exception("Attempting to schedule actor that "
-                                    "isn't a ClientActorClass.")
+                                    "isn't a class.")
                reg_class = ray.remote(actor_class)
-                self.registered_actor_classes[task.payload_id] = reg_class
-            remote_class = self.registered_actor_classes[task.payload_id]
-            arglist = _convert_args(task.args)
+                self.registered_actor_classes[payload_ref.binary()] = reg_class
+            remote_class = self.registered_actor_classes[payload_ref.binary()]
+            arglist = _convert_args(task.args, prepared_args)
            actor = remote_class.remote(*arglist)
-            actor_ref = actor._actor_id
-            self.actor_refs[actor_ref.binary()] = actor
-        return ray_client_pb2.ClientTaskTicket(return_id=actor_ref.binary())
+            actorhandle = cloudpickle.dumps(actor)
+            self.actor_refs[actorhandle] = actor
+        return ray_client_pb2.ClientTaskTicket(
+            return_ref=make_remote_ref(actor._actor_id.binary(), actorhandle))

-    def _schedule_function(self, task: ray_client_pb2.ClientTask,
-                           context=None) -> ray_client_pb2.ClientTaskTicket:
-        if task.payload_id not in self.function_refs:
-            funcref = self.object_refs[task.payload_id]
+    def _schedule_function(
+            self,
+            task: ray_client_pb2.ClientTask,
+            context=None,
+            prepared_args=None) -> ray_client_pb2.ClientTaskTicket:
+        payload_ref = cloudpickle.loads(task.payload_id)
+        if payload_ref.binary() not in self.function_refs:
+            funcref = self.object_refs[payload_ref.binary()]
            func = ray.get(funcref)
-            if not isinstance(func, ClientRemoteFunc):
+            if not inspect.isfunction(func):
                raise Exception("Attempting to schedule function that "
-                                "isn't a ClientRemoteFunc.")
-            self.function_refs[task.payload_id] = func
-        remote_func = self.function_refs[task.payload_id]
-        arglist = _convert_args(task.args)
+                                "isn't a function.")
+            self.function_refs[payload_ref.binary()] = ray.remote(func)
+        remote_func = self.function_refs[payload_ref.binary()]
+        arglist = _convert_args(task.args, prepared_args)
        # Prepare call if we're in a test
        with stash_api_for_tests(self._test_mode):
            output = remote_func.remote(*arglist)
+            if output.binary() in self.object_refs:
+                raise Exception("already found it")
            self.object_refs[output.binary()] = output
-        return ray_client_pb2.ClientTaskTicket(return_id=output.binary())
+            pickled_output = cloudpickle.dumps(output)
+        return ray_client_pb2.ClientTaskTicket(
+            return_ref=make_remote_ref(output.binary(), pickled_output))


-def _convert_args(arg_list):
+def _convert_args(arg_list, prepared_args=None):
+    if prepared_args is not None:
+        return prepared_args
    out = []
    for arg in arg_list:
        t = convert_from_arg(arg)
        if isinstance(t, ClientObjectRef):
-            out.append(ray.ObjectRef(t.id))
+            out.append(t._unpack_ref())
        else:
            out.append(t)
    return out


+def make_remote_ref(id: bytes, handle: bytes) -> ray_client_pb2.RemoteRef:
+    return ray_client_pb2.RemoteRef(
+        id=id,
+        handle=handle,
+    )
+
+
+def return_exception_in_context(err, context):
+    if context is not None:
+        context.set_details(encode_exception(err))
+        context.set_code(grpc.StatusCode.INTERNAL)
+
+
 def serve(connection_str, test_mode=False):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    task_servicer = RayletServicer(test_mode=test_mode)
+    _set_server_api(RayServerAPI(task_servicer))
    ray_client_pb2_grpc.add_RayletDriverServicer_to_server(
        task_servicer, server)
    server.add_insecure_port(connection_str)
@@ -3,22 +3,29 @@ It implements the Ray API functions that are forwarded through grpc calls
 to the server.
 """
 import inspect
+import json
+import logging
+from typing import Any
 from typing import List
 from typing import Tuple
+from typing import Optional

 import ray.cloudpickle as cloudpickle
 from ray.util.inspect import is_cython
 import grpc

+from ray.exceptions import TaskCancelledError
 import ray.core.generated.ray_client_pb2 as ray_client_pb2
 import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
 from ray.experimental.client.common import convert_to_arg
+from ray.experimental.client.common import decode_exception
 from ray.experimental.client.common import ClientObjectRef
-from ray.experimental.client.common import ClientActorRef
 from ray.experimental.client.common import ClientActorClass
-from ray.experimental.client.common import ClientRemoteMethod
+from ray.experimental.client.common import ClientActorHandle
 from ray.experimental.client.common import ClientRemoteFunc

+logger = logging.getLogger(__name__)
+

 class Worker:
    def __init__(self,
@@ -34,6 +41,7 @@ class Worker:
            metadata: additional metadata passed in the grpc request headers.
        """
        self.metadata = metadata
+        self.channel = None
        if stub is None:
            if secure:
                credentials = grpc.ssl_channel_credentials()
@@ -44,28 +52,32 @@ class Worker:
        else:
            self.server = stub

-    def get(self, ids):
+    def get(self, vals, *, timeout: Optional[float] = None) -> Any:
        to_get = []
        single = False
-        if isinstance(ids, list):
-            to_get = [x.id for x in ids]
-        elif isinstance(ids, ClientObjectRef):
-            to_get = [ids.id]
+        if isinstance(vals, list):
+            to_get = [x.handle for x in vals]
+        elif isinstance(vals, ClientObjectRef):
+            to_get = [vals.handle]
            single = True
        else:
            raise Exception("Can't get something that's not a "
-                            "list of IDs or just an ID: %s" % type(ids))
-        out = [self._get(x) for x in to_get]
+                            "list of IDs or just an ID: %s" % type(vals))
+        if timeout is None:
+            timeout = 0
+        out = [self._get(x, timeout) for x in to_get]
        if single:
            out = out[0]
        return out

-    def _get(self, id: bytes):
-        req = ray_client_pb2.GetRequest(id=id)
-        data = self.server.GetObject(req, metadata=self.metadata)
+    def _get(self, handle: bytes, timeout: float):
+        req = ray_client_pb2.GetRequest(handle=handle, timeout=timeout)
+        try:
+            data = self.server.GetObject(req, metadata=self.metadata)
+        except grpc.RpcError as e:
+            raise decode_exception(e.details())
        if not data.valid:
-            raise Exception(
-                "Client GetObject returned invalid data: id invalid?")
+            raise TaskCancelledError(handle)
        return cloudpickle.loads(data.data)

    def put(self, vals):
@@ -86,7 +98,7 @@ class Worker:
        data = cloudpickle.dumps(val)
        req = ray_client_pb2.PutRequest(data=data)
        resp = self.server.PutObject(req, metadata=self.metadata)
-        return ClientObjectRef(resp.id)
+        return ClientObjectRef.from_remote_ref(resp.ref)

    def wait(self,
             object_refs: List[ClientObjectRef],
@@ -98,8 +110,8 @@ class Worker:
        for ref in object_refs:
            assert isinstance(ref, ClientObjectRef)
        data = {
-            "object_refs": [
-                cloudpickle.dumps(object_ref) for object_ref in object_refs
+            "object_handles": [
+                object_ref.handle for object_ref in object_refs
            ],
            "num_returns": num_returns,
            "timeout": timeout if timeout else -1
@@ -110,10 +122,12 @@ class Worker:
            # TODO(ameer): improve error/exceptions messages.
            raise Exception("Client Wait request failed. Reference invalid?")
        client_ready_object_ids = [
-            ClientObjectRef(id) for id in resp.ready_object_ids
+            ClientObjectRef.from_remote_ref(ref)
+            for ref in resp.ready_object_ids
        ]
        client_remaining_object_ids = [
-            ClientObjectRef(id) for id in resp.remaining_object_ids
+            ClientObjectRef.from_remote_ref(ref)
+            for ref in resp.remaining_object_ids
        ]

        return (client_ready_object_ids, client_remaining_object_ids)
@@ -130,50 +144,60 @@ class Worker:
            raise TypeError("The @ray.remote decorator must be applied to "
                            "either a function or to a class.")

-    def call_remote(self, instance, kind, *args, **kwargs):
-        ticket = None
-        if kind == ray_client_pb2.ClientTask.FUNCTION:
-            ticket = self._put_and_schedule(instance, kind, *args, **kwargs)
-        elif kind == ray_client_pb2.ClientTask.ACTOR:
-            ticket = self._put_and_schedule(instance, kind, *args, **kwargs)
-            return ClientActorRef(ticket.return_id)
-        elif kind == ray_client_pb2.ClientTask.METHOD:
-            ticket = self._call_method(instance, *args, **kwargs)
-
-        if ticket is None:
-            raise Exception(
-                "Couldn't call_remote on %s for type %s" % (instance, kind))
-        return ClientObjectRef(ticket.return_id)
-
-    def _call_method(self, instance: ClientRemoteMethod, *args, **kwargs):
-        if not isinstance(instance, ClientRemoteMethod):
-            raise TypeError("Client not passing a ClientRemoteMethod stub")
-        task = ray_client_pb2.ClientTask()
-        task.type = ray_client_pb2.ClientTask.METHOD
-        task.name = instance.method_name
-        task.payload_id = instance.actor_handle.actor_id.id
+    def call_remote(self, instance, *args, **kwargs):
+        task = instance._prepare_client_task()
        for arg in args:
            pb_arg = convert_to_arg(arg)
            task.args.append(pb_arg)
+        logging.debug("Scheduling %s" % task)
        ticket = self.server.Schedule(task, metadata=self.metadata)
-        return ticket
-
-    def _put_and_schedule(self, item, task_type, *args, **kwargs):
-        if isinstance(item, ClientRemoteFunc):
-            ref = self._put(item)
-        elif isinstance(item, ClientActorClass):
-            ref = self._put(item.actor_cls)
-        else:
-            raise TypeError("Client not passing a ClientRemoteFunc stub")
-        task = ray_client_pb2.ClientTask()
-        task.type = task_type
-        task.name = item._name
-        task.payload_id = ref.id
-        for arg in args:
-            pb_arg = convert_to_arg(arg)
-            task.args.append(pb_arg)
-        ticket = self.server.Schedule(task, metadata=self.metadata)
-        return ticket
+        return ClientObjectRef.from_remote_ref(ticket.return_ref)

    def close(self):
-        self.channel.close()
+        self.server = None
+        if self.channel:
+            self.channel.close()
+
+    def terminate_actor(self, actor: ClientActorHandle,
+                        no_restart: bool) -> None:
+        if not isinstance(actor, ClientActorHandle):
+            raise ValueError("ray.kill() only supported for actors. "
+                             "Got: {}.".format(type(actor)))
+        term_actor = ray_client_pb2.TerminateRequest.ActorTerminate()
+        term_actor.handle = actor.actor_ref.handle
+        term_actor.no_restart = no_restart
+        try:
+            term = ray_client_pb2.TerminateRequest(actor=term_actor)
+            self.server.Terminate(term)
+        except grpc.RpcError as e:
+            raise decode_exception(e.details())
+
+    def terminate_task(self, obj: ClientObjectRef, force: bool,
+                       recursive: bool) -> None:
+        if not isinstance(obj, ClientObjectRef):
+            raise TypeError(
+                "ray.cancel() only supported for non-actor object refs. "
+                f"Got: {type(obj)}.")
+        term_object = ray_client_pb2.TerminateRequest.TaskObjectTerminate()
+        term_object.handle = obj.handle
+        term_object.force = force
+        term_object.recursive = recursive
+        try:
+            term = ray_client_pb2.TerminateRequest(task_object=term_object)
+            self.server.Terminate(term)
+        except grpc.RpcError as e:
+            raise decode_exception(e.details())
+
+    def get_cluster_info(self, type: ray_client_pb2.ClusterInfoType.TypeEnum):
+        req = ray_client_pb2.ClusterInfoRequest()
+        req.type = type
+        resp = self.server.ClusterInfo(req)
+        if resp.WhichOneof("response_type") == "resource_table":
+            return resp.resource_table.table
+        return json.loads(resp.json)
+
+    def is_initialized(self) -> bool:
+        if self.server is not None:
+            return self.get_cluster_info(
+                ray_client_pb2.ClusterInfoType.IS_INITIALIZED)
+        return False
@@ -7,8 +7,8 @@ from ray.core.generated.gcs_pb2 import (
    JobConfig,
    ErrorTableData,
    GcsEntry,
-    HeartbeatBatchTableData,
-    HeartbeatTableData,
+    ResourceUsageBatchData,
+    ResourcesData,
    ObjectTableData,
    ProfileTableData,
    TablePrefix,
@@ -33,8 +33,8 @@ __all__ = [
    "ErrorTableData",
    "ErrorType",
    "GcsEntry",
-    "HeartbeatBatchTableData",
-    "HeartbeatTableData",
+    "ResourceUsageBatchData",
+    "ResourcesData",
    "ObjectTableData",
    "ProfileTableData",
    "TablePrefix",
@@ -55,8 +55,8 @@ FUNCTION_PREFIX = "RemoteFunction:"
 LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
 REPORTER_CHANNEL = "RAY_REPORTER"

-# xray heartbeats
-XRAY_HEARTBEAT_BATCH_PATTERN = "HEARTBEAT_BATCH:".encode("ascii")
+# xray resource usages
+XRAY_RESOURCES_BATCH_PATTERN = "RESOURCES_BATCH:".encode("ascii")

 # xray job updates
 XRAY_JOB_PATTERN = "JOB:*".encode("ascii")
@@ -23,7 +23,7 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
        c_vector[c_string] GetAllProfileInfo()
        c_vector[c_string] GetAllObjectInfo()
        unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
-        unique_ptr[c_string] GetAllHeartbeat()
+        unique_ptr[c_string] GetAllResourceUsage()
        c_vector[c_string] GetAllActorInfo()
        unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
        c_string GetNodeResourceInfo(const CNodeID &node_id)
@@ -78,11 +78,11 @@ cdef class GlobalStateAccessor:
            return c_string(object_info.get().data(), object_info.get().size())
        return None

-    def get_all_heartbeat(self):
-        """Get newest heartbeat of all nodes from GCS service."""
+    def get_all_resource_usage(self):
+        """Get newest resource usage of all nodes from GCS service."""
        cdef unique_ptr[c_string] result
        with nogil:
-            result = self.inner.get().GetAllHeartbeat()
+            result = self.inner.get().GetAllResourceUsage()
        if result:
            return c_string(result.get().data(), result.get().size())
        return None
@@ -90,7 +90,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
            const CTaskOptions &options, c_vector[CObjectID] *return_ids,
            int max_retries,
            c_pair[CPlacementGroupID, int64_t] placement_options,
-            c_bool placement_group_capture_child_tasks)
+            c_bool placement_group_capture_child_tasks,
+            c_string debugger_breakpoint)
        CRayStatus CreateActor(
            const CRayFunction &function,
            const c_vector[unique_ptr[CTaskArg]] &args,
@@ -101,6 +102,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
            CPlacementGroupID *placement_group_id)
        CRayStatus RemovePlacementGroup(
            const CPlacementGroupID &placement_group_id)
+        CRayStatus WaitPlacementGroupReady(
+            const CPlacementGroupID &placement_group_id, int timeout_ms)
        void SubmitActorTask(
            const CActorID &actor_id, const CRayFunction &function,
            const c_vector[unique_ptr[CTaskArg]] &args,
@@ -222,6 +225,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
            const c_vector[shared_ptr[CRayObject]] &args,
            const c_vector[CObjectID] &arg_reference_ids,
            const c_vector[CObjectID] &return_ids,
+            const c_string debugger_breakpoint,
            c_vector[shared_ptr[CRayObject]] *returns) nogil
         ) task_execution_callback
        (void(const CWorkerID &) nogil) on_worker_shutdown
@@ -15,7 +15,7 @@ cdef extern from "ray/common/ray_config.h" nogil:

        int64_t raylet_heartbeat_timeout_milliseconds() const

-        c_bool light_heartbeat_enabled() const
+        c_bool light_report_resource_usage_enabled() const

        int64_t debug_dump_period_milliseconds() const

@@ -51,10 +51,6 @@ cdef extern from "ray/common/ray_config.h" nogil:

        uint64_t object_manager_default_chunk_size() const

-        int num_workers_per_process_python() const
-
-        int num_workers_per_process_java() const
-
        uint32_t maximum_gcs_deletion_batch_size() const

        int64_t max_direct_call_object_size() const
@@ -68,3 +64,5 @@ cdef extern from "ray/common/ray_config.h" nogil:
        c_bool enable_timeline() const

        c_bool automatic_object_deletion_enabled() const
+
+        uint32_t max_grpc_message_size() const
@@ -14,8 +14,8 @@ cdef class Config:
        return RayConfig.instance().raylet_heartbeat_timeout_milliseconds()

    @staticmethod
-    def light_heartbeat_enabled():
-        return RayConfig.instance().light_heartbeat_enabled()
+    def light_report_resource_usage_enabled():
+        return RayConfig.instance().light_report_resource_usage_enabled()

    @staticmethod
    def debug_dump_period_milliseconds():
@@ -88,14 +88,6 @@ cdef class Config:
    def object_manager_default_chunk_size():
        return RayConfig.instance().object_manager_default_chunk_size()

-    @staticmethod
-    def num_workers_per_process_python():
-        return RayConfig.instance().num_workers_per_process_python()
-
-    @staticmethod
-    def num_workers_per_process_java():
-        return RayConfig.instance().num_workers_per_process_java()
-
    @staticmethod
    def maximum_gcs_deletion_batch_size():
        return RayConfig.instance().maximum_gcs_deletion_batch_size()
@@ -119,3 +111,7 @@ cdef class Config:
    @staticmethod
    def automatic_object_deletion_enabled():
        return RayConfig.instance().automatic_object_deletion_enabled()
+
+    @staticmethod
+    def max_grpc_message_size():
+        return RayConfig.instance().max_grpc_message_size()
@@ -1,7 +1,9 @@
+import ray
 import ray.worker
 from ray import profiling

 __all__ = ["free", "global_gc"]
+MAX_MESSAGE_LENGTH = ray._config.max_grpc_message_size()


 def global_gc():
@@ -22,7 +24,13 @@ def memory_summary():
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    ray.nodes()[0]["NodeManagerPort"])
-    channel = grpc.insecure_channel(raylet_address)
+    channel = grpc.insecure_channel(
+        raylet_address,
+        options=[
+            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
+            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
+        ],
+    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
@@ -133,7 +133,7 @@ class LogMonitor:
                job_match = JOB_LOG_PATTERN.match(file_path)
                if job_match:
                    job_id = job_match.group(2)
-                    worker_pid = job_match.group(3)
+                    worker_pid = int(job_match.group(3))
                else:
                    job_id = None
                    worker_pid = None
@@ -361,4 +361,5 @@ if __name__ == "__main__":
                   f"failed with the following error:\n{traceback_str}")
        ray.utils.push_error_to_driver_through_redis(
            redis_client, ray_constants.LOG_MONITOR_DIED_ERROR, message)
+        logger.error(message)
        raise e
@@ -91,7 +91,7 @@ class MemoryMonitor:
        if not psutil:
            logger.warn("WARNING: Not monitoring node memory since `psutil` "
                        "is not installed. Install this with "
-                        "`pip install psutil` (or ray[debug]) to enable "
+                        "`pip install psutil` to enable "
                        "debugging of memory-related crashes.")

    def get_memory_usage(self):
@@ -85,7 +85,11 @@ class Monitor:
            This is used to receive notifications about failed components.
    """

-    def __init__(self, redis_address, autoscaling_config, redis_password=None):
+    def __init__(self,
+                 redis_address,
+                 autoscaling_config,
+                 redis_password=None,
+                 prefix_cluster_info=False):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
@@ -107,8 +111,10 @@ class Monitor:
        head_node_ip = redis_address.split(":")[0]
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        if autoscaling_config:
-            self.autoscaler = StandardAutoscaler(autoscaling_config,
-                                                 self.load_metrics)
+            self.autoscaler = StandardAutoscaler(
+                autoscaling_config,
+                self.load_metrics,
+                prefix_cluster_info=prefix_cluster_info)
            self.autoscaling_config = autoscaling_config
        else:
            self.autoscaler = None
@@ -139,24 +145,24 @@ class Monitor:
        self.primary_subscribe_client.subscribe(channel)

    def update_load_metrics(self):
-        """Fetches heartbeat data from GCS and updates load metrics."""
+        """Fetches resource usage data from GCS and updates load metrics."""

-        all_heartbeat = self.global_state_accessor.get_all_heartbeat()
-        heartbeat_batch_data = \
-            ray.gcs_utils.HeartbeatBatchTableData.FromString(all_heartbeat)
-        for heartbeat_message in heartbeat_batch_data.batch:
-            resource_load = dict(heartbeat_message.resource_load)
-            total_resources = dict(heartbeat_message.resources_total)
-            available_resources = dict(heartbeat_message.resources_available)
+        all_resources = self.global_state_accessor.get_all_resource_usage()
+        resources_batch_data = \
+            ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources)
+        for resource_message in resources_batch_data.batch:
+            resource_load = dict(resource_message.resource_load)
+            total_resources = dict(resource_message.resources_total)
+            available_resources = dict(resource_message.resources_available)

            waiting_bundles, infeasible_bundles = parse_resource_demands(
-                heartbeat_batch_data.resource_load_by_shape)
+                resources_batch_data.resource_load_by_shape)

            pending_placement_groups = list(
-                heartbeat_batch_data.placement_group_load.placement_group_data)
+                resources_batch_data.placement_group_load.placement_group_data)

            # Update the load metrics for this raylet.
-            node_id = ray.utils.binary_to_hex(heartbeat_message.node_id)
+            node_id = ray.utils.binary_to_hex(resource_message.node_id)
            ip = self.raylet_id_to_ip_map.get(node_id)
            if ip:
                self.load_metrics.update(ip, total_resources,
@@ -1,14 +1,14 @@
 linux:
-  "3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
-  "3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
-  "3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
+  "3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
+  "3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+  "3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl

 darwin:
-  "3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-macosx_10_13_x86_64.whl
-  "3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-macosx_10_13_intel.whl
-  "3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-macosx_10_13_intel.whl
+  "3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-macosx_10_13_x86_64.whl
+  "3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-macosx_10_13_intel.whl
+  "3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-macosx_10_13_intel.whl

 win32:
-  "3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp38-cp38-win_amd64.whl
-  "3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-win_amd64.whl
-  "3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp36-cp36m-win_amd64.whl
+  "3.8": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp38-cp38-win_amd64.whl
+  "3.7": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-win_amd64.whl
+  "3.6": https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp36-cp36m-win_amd64.whl
@@ -339,10 +339,6 @@ class Node:
        """Get the cluster Redis password"""
        return self._ray_params.redis_password

-    @property
-    def load_code_from_local(self):
-        return self._ray_params.load_code_from_local
-
    @property
    def object_ref_seed(self):
        """Get the seed for deterministic generation of object refs"""
@@ -723,14 +719,12 @@ class Node:
            stderr_file=stderr_file,
            config=self._config,
            java_worker_options=self._ray_params.java_worker_options,
-            load_code_from_local=self._ray_params.load_code_from_local,
            huge_pages=self._ray_params.huge_pages,
            fate_share=self.kernel_fate_share,
            socket_to_use=self.socket,
            head_node=self.head,
            start_initial_python_workers_for_first_job=self._ray_params.
-            start_initial_python_workers_for_first_job,
-            code_search_path=self._ray_params.code_search_path)
+            start_initial_python_workers_for_first_job)
        assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
        self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]

@@ -739,12 +733,19 @@ class Node:
        raise NotImplementedError

    def start_monitor(self):
-        """Start the monitor."""
+        """Start the monitor.
+
+        Autoscaling output goes to these monitor.err/out files, and
+        any modification to these files may break existing
+        cluster launching commands.
+        """
+        stdout_file, stderr_file = self.get_log_file_handles(
+            "monitor", unique=True)
        process_info = ray._private.services.start_monitor(
            self._redis_address,
            self._logs_dir,
-            stdout_file=subprocess.DEVNULL,
-            stderr_file=subprocess.DEVNULL,
+            stdout_file=stdout_file,
+            stderr_file=stderr_file,
            autoscaling_config=self._ray_params.autoscaling_config,
            redis_password=self._ray_params.redis_password,
            fate_share=self.kernel_fate_share)
@@ -1,108 +0,0 @@
-"""
-Ray operator for Kubernetes.
-
-Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using
-create_or_update_cluster(), then runs an autoscaling loop in the operator pod
-executing this script. Writes autoscaling logs to the directory
-/root/ray-operator-logs.
-
-In this setup, the ray head node does not run an autoscaler. It is important
-NOT to supply an --autoscaling-config argument to head node's ray start command
-in the cluster config when using this operator.
-
-To run, first create a ConfigMap named ray-operator-configmap from a ray
-cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
-
-For example:
-kubectl create namespace raytest
-kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml
-kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
-""" # noqa
-import os
-from typing import Any, Dict, IO, Tuple
-
-import kubernetes
-import yaml
-
-from ray._private import services
-from ray.autoscaler._private.commands import create_or_update_cluster
-from ray.autoscaler._private.kubernetes import core_api
-from ray.utils import open_log
-from ray import ray_constants
-
-RAY_CLUSTER_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE")
-RAY_CONFIG_MAP = "ray-operator-configmap"
-RAY_CONFIG_DIR = "/root"
-
-LOG_DIR = "/root/ray-operator-logs"
-ERR_NAME, OUT_NAME = "ray-operator.err", "ray-operator.out"
-
-
-def prepare_ray_cluster_config() -> str:
-    config_map = core_api().read_namespaced_config_map(
-        name=RAY_CONFIG_MAP, namespace=RAY_CLUSTER_NAMESPACE)
-
-    # config_map.data consists of a single key:value pair
-    for config_file_name, config_string in config_map.data.items():
-        config = yaml.safe_load(config_string)
-        config["provider"]["namespace"] = RAY_CLUSTER_NAMESPACE
-        cluster_config_path = os.path.join(RAY_CONFIG_DIR, config_file_name)
-        with open(cluster_config_path, "w") as file:
-            yaml.dump(config, file)
-
-    return cluster_config_path
-
-
-def get_ray_head_pod_ip(config: Dict[str, Any]) -> str:
-    cluster_name = config["cluster_name"]
-    label_selector = f"component=ray-head,ray-cluster-name={cluster_name}"
-    pods = core_api().list_namespaced_pod(
-        namespace=RAY_CLUSTER_NAMESPACE, label_selector=label_selector).items
-    assert (len(pods)) == 1
-    head_pod = pods.pop()
-    return head_pod.status.pod_ip
-
-
-def get_logs() -> Tuple[IO, IO]:
-    try:
-        os.makedirs(LOG_DIR)
-    except OSError:
-        pass
-
-    err_path = os.path.join(LOG_DIR, ERR_NAME)
-    out_path = os.path.join(LOG_DIR, OUT_NAME)
-
-    return open_log(err_path), open_log(out_path)
-
-
-def main():
-    kubernetes.config.load_incluster_config()
-    cluster_config_path = prepare_ray_cluster_config()
-
-    config = create_or_update_cluster(
-        cluster_config_path,
-        override_min_workers=None,
-        override_max_workers=None,
-        no_restart=False,
-        restart_only=False,
-        yes=True,
-        no_config_cache=True)
-    with open(cluster_config_path, "w") as file:
-        yaml.dump(config, file)
-
-    ray_head_pod_ip = get_ray_head_pod_ip(config)
-    # TODO: Add support for user-specified redis port and password
-    redis_address = services.address(ray_head_pod_ip,
-                                     ray_constants.DEFAULT_PORT)
-    stderr_file, stdout_file = get_logs()
-
-    services.start_monitor(
-        redis_address,
-        stdout_file=stdout_file,
-        stderr_file=stderr_file,
-        autoscaling_config=cluster_config_path,
-        redis_password=ray_constants.REDIS_DEFAULT_PASSWORD)
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1,154 @@
+"""
+Ray operator for Kubernetes.
+
+Reads ray cluster config from a k8s ConfigMap, starts a ray head node pod using
+create_or_update_cluster(), then runs an autoscaling loop in the operator pod
+executing this script. Writes autoscaling logs to the directory
+/root/ray-operator-logs.
+
+In this setup, the ray head node does not run an autoscaler. It is important
+NOT to supply an --autoscaling-config argument to head node's ray start command
+in the cluster config when using this operator.
+
+To run, first create a ConfigMap named ray-operator-configmap from a ray
+cluster config. Then apply the manifest at python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
+
+For example:
+kubectl create namespace raytest
+kubectl -n raytest create configmap ray-operator-configmap --from-file=python/ray/autoscaler/kubernetes/operator_configs/test_cluster_config.yaml
+kubectl -n raytest apply -f python/ray/autoscaler/kubernetes/operator_configs/operator_config.yaml
+""" # noqa
+import logging
+import multiprocessing as mp
+import os
+from typing import Any, Callable, Dict, Optional
+
+from kubernetes.client.exceptions import ApiException
+import yaml
+
+from ray._private import services
+from ray.autoscaler._private import commands
+from ray import monitor
+from ray.operator import operator_utils
+from ray import ray_constants
+
+
+class RayCluster():
+    """Manage the head node and autoscaling monitor of one Ray cluster.
+
+    Launching the head node and running the monitor happens in a daemonic
+    ``multiprocessing.Process`` named after the cluster, so the work can be
+    terminated and restarted whenever the cluster is updated or deleted.
+
+    Args:
+        config: Ray cluster config; must contain the key "cluster_name".
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config
+        self.name = self.config["cluster_name"]
+        self.config_path = operator_utils.config_path(self.name)
+
+        self.setup_logging()
+
+        self.subprocess = None  # type: Optional[mp.Process]
+
+    def do_in_subprocess(self,
+                         f: Callable[[], None],
+                         wait_to_finish: bool = False) -> None:
+        """Run ``f`` in a fresh subprocess, replacing any running one.
+
+        Args:
+            f: Zero-argument callable used as the process target.
+            wait_to_finish: If true, block until the subprocess exits.
+        """
+        # First stop the subprocess if it's alive
+        self.clean_up_subprocess()
+        # Reinstantiate process with f as target and start.
+        self.subprocess = mp.Process(name=self.name, target=f)
+        # Kill subprocess if monitor dies
+        self.subprocess.daemon = True
+        self.subprocess.start()
+        if wait_to_finish:
+            self.subprocess.join()
+
+    def clean_up_subprocess(self):
+        """Terminate the subprocess, if one is alive, and wait for it."""
+        if self.subprocess and self.subprocess.is_alive():
+            self.subprocess.terminate()
+            self.subprocess.join()
+
+    def create_or_update(self) -> None:
+        """(Re)start the head node and monitor in a background subprocess."""
+        self.do_in_subprocess(self._create_or_update)
+
+    def _create_or_update(self) -> None:
+        """Subprocess target: start the head node, then run the monitor."""
+        self.start_head()
+        self.start_monitor()
+
+    def start_head(self) -> None:
+        """Write the config to disk and create or update the cluster.
+
+        The config returned by ``create_or_update_cluster`` replaces
+        ``self.config`` and is written back to ``self.config_path``.
+        """
+        self.write_config()
+        self.config = commands.create_or_update_cluster(
+            self.config_path,
+            override_min_workers=None,
+            override_max_workers=None,
+            no_restart=False,
+            restart_only=False,
+            yes=True,
+            no_config_cache=True)
+        self.write_config()
+
+    def start_monitor(self) -> None:
+        """Create a ``Monitor`` for the cluster's head node and run it."""
+        ray_head_pod_ip = commands.get_head_node_ip(self.config_path)
+        # TODO: Add support for user-specified redis port and password
+        redis_address = services.address(ray_head_pod_ip,
+                                         ray_constants.DEFAULT_PORT)
+        self.mtr = monitor.Monitor(
+            redis_address=redis_address,
+            autoscaling_config=self.config_path,
+            redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
+            prefix_cluster_info=True)
+        # NOTE(review): presumably blocks for the monitor's lifetime, which
+        # is why it runs inside the subprocess -- confirm against Monitor.
+        self.mtr.run()
+
+    def clean_up(self) -> None:
+        """Stop the subprocess, detach logging and delete the config file."""
+        self.clean_up_subprocess()
+        self.clean_up_logging()
+        self.delete_config()
+
+    def setup_logging(self) -> None:
+        """Attach a stream handler dedicated to this cluster.
+
+        The handler only emits records whose ``processName`` equals the
+        cluster name and prefixes each line with that name.
+        """
+        self.handler = logging.StreamHandler()
+        self.handler.addFilter(lambda rec: rec.processName == self.name)
+        logging_format = ":".join([self.name, ray_constants.LOGGER_FORMAT])
+        self.handler.setFormatter(logging.Formatter(logging_format))
+        operator_utils.root_logger.addHandler(self.handler)
+
+    def clean_up_logging(self) -> None:
+        """Detach this cluster's handler from the operator's root logger."""
+        operator_utils.root_logger.removeHandler(self.handler)
+
+    def write_config(self) -> None:
+        """Dump ``self.config`` as YAML to ``self.config_path``."""
+        with open(self.config_path, "w") as file:
+            yaml.dump(self.config, file)
+
+    def delete_config(self) -> None:
+        """Remove the on-disk config; a missing file is not an error."""
+        # The file is only written from the subprocess, which may have been
+        # terminated (or may have failed) before it ever created the file.
+        # Raising here would abort the operator's control loop on DELETED.
+        try:
+            os.remove(self.config_path)
+        except FileNotFoundError:
+            pass
+
+
+ray_clusters = {}
+
+
+def cluster_action(cluster_config: Dict[str, Any], event_type: str) -> None:
+    """Apply one RayCluster watch event to the ``ray_clusters`` registry.
+
+    Args:
+        cluster_config: Ray cluster config; must contain "cluster_name".
+        event_type: Watch event type. "ADDED", "MODIFIED" and "DELETED" are
+            handled; any other value is ignored.
+
+    Raises:
+        KeyError: If a "MODIFIED" or "DELETED" event names a cluster that is
+            not in the registry.
+    """
+    cluster_name = cluster_config["cluster_name"]
+    if event_type == "ADDED":
+        ray_clusters[cluster_name] = RayCluster(cluster_config)
+        ray_clusters[cluster_name].create_or_update()
+    elif event_type == "MODIFIED":
+        # Adopt the modified config before relaunching. Without this the
+        # subprocess would rewrite and re-apply the config captured when the
+        # cluster was first added, silently discarding the modification.
+        ray_clusters[cluster_name].config = cluster_config
+        ray_clusters[cluster_name].create_or_update()
+    elif event_type == "DELETED":
+        ray_clusters[cluster_name].clean_up()
+        del ray_clusters[cluster_name]
+
+
+def main() -> None:
+    """Run the operator control loop over RayCluster custom resources.
+
+    Ensures the config directory exists, then converts every watch event
+    into a cluster config and dispatches it to ``cluster_action``.
+
+    Raises:
+        Exception: If the Kubernetes API answers with a 404 status.
+        ApiException: Any other API error is re-raised unchanged.
+    """
+    # Make directory for ray cluster configs
+    config_dir = operator_utils.RAY_CONFIG_DIR
+    if not os.path.isdir(config_dir):
+        os.mkdir(config_dir)
+    # Control loop
+    events = operator_utils.cluster_cr_stream()
+    try:
+        for event in events:
+            cluster_cr, event_type = event["object"], event["type"]
+            cluster_action(operator_utils.cr_to_config(cluster_cr), event_type)
+    except ApiException as e:
+        if e.status != 404:
+            raise
+        raise Exception(
+            "Caught a 404 error. Has the RayCluster CRD been created?")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,114 @@
+import copy
+import logging
+import os
+from typing import Any, Dict, Iterator, List
+
+from kubernetes.watch import Watch
+
+from ray.autoscaler._private.kubernetes import custom_objects_api
+
+RAY_NAMESPACE = os.environ.get("RAY_OPERATOR_POD_NAMESPACE")
+
+RAY_CONFIG_DIR = os.path.expanduser("~/ray_cluster_configs")
+CONFIG_SUFFIX = "_config.yaml"
+
+CONFIG_FIELDS = {
+    "maxWorkers": "max_workers",
+    "upscalingSpeed": "upscaling_speed",
+    "idleTimeoutMinutes": "idle_timeout_minutes",
+    "headPodType": "head_node_type",
+    "workerDefaultPodType": "worker_default_node_type",
+    "workerStartRayCommands": "worker_start_ray_commands",
+    "headStartRayCommands": "head_start_ray_commands",
+    "podTypes": "available_node_types"
+}
+
+NODE_TYPE_FIELDS = {
+    "minWorkers": "min_workers",
+    "maxWorkers": "max_workers",
+    "podConfig": "node_config",
+    "rayResources": "resources",
+    "setupCommands": "worker_setup_commands"
+}
+
+PROVIDER_CONFIG = {
+    "type": "kubernetes",
+    "use_internal_ips": True,
+    "namespace": RAY_NAMESPACE
+}
+
+root_logger = logging.getLogger("ray")
+root_logger.setLevel(logging.getLevelName("DEBUG"))
+"""
+ownerReferences:
+  - apiVersion: apps/v1
+    controller: true
+    blockOwnerDeletion: true
+    kind: ReplicaSet
+    name: my-repset
+    uid: d9607e19-f88f-11e6-a518-42010a800195
+"""
+
+
+def config_path(cluster_name: str) -> str:
+    """Return the path of the config file for the named cluster.
+
+    The file lives in ``RAY_CONFIG_DIR`` and is named after the cluster with
+    ``CONFIG_SUFFIX`` appended.
+    """
+    return os.path.join(RAY_CONFIG_DIR, cluster_name + CONFIG_SUFFIX)
+
+
+def cluster_cr_stream() -> Iterator:
+    """Return a watch stream over RayCluster custom resources.
+
+    The stream lists the "rayclusters" resources of group "cluster.ray.io",
+    version "v1", in ``RAY_NAMESPACE``.
+    """
+    watcher = Watch()
+    resource_selector = dict(
+        namespace=RAY_NAMESPACE,
+        group="cluster.ray.io",
+        version="v1",
+        plural="rayclusters")
+    return watcher.stream(custom_objects_api().list_namespaced_custom_object,
+                          **resource_selector)
+
+
+def cr_to_config(cluster_resource: Dict[str, Any]) -> Dict[str, Any]:
+    """Convert a RayCluster custom resource to a Ray cluster config.
+
+    Args:
+        cluster_resource: The custom resource; its "spec" must contain
+            "podTypes" and its "metadata" must contain "name" and "uid".
+
+    Returns:
+        A cluster config for use by the autoscaler, with the spec fields
+        renamed per ``CONFIG_FIELDS`` plus "available_node_types",
+        "cluster_name" and "provider".
+
+    Raises:
+        KeyError: If a required key is missing or the spec contains a field
+            that is not listed in ``CONFIG_FIELDS``.
+    """
+    cr_spec = cluster_resource["spec"]
+    cr_meta = cluster_resource["metadata"]
+    config = translate(cr_spec, dictionary=CONFIG_FIELDS)
+    pod_types = cr_spec["podTypes"]
+    config["available_node_types"] = get_node_types(
+        pod_types, cluster_name=cr_meta["name"], cluster_uid=cr_meta["uid"])
+    config["cluster_name"] = cr_meta["name"]
+    # Give every config its own provider section. Handing out the shared
+    # module-level dict would let a mutation made through one cluster's
+    # config leak into every other cluster's config.
+    config["provider"] = copy.deepcopy(PROVIDER_CONFIG)
+    return config
+
+
+def get_node_types(pod_types: List[Dict[str, Any]], cluster_name: str,
+                   cluster_uid: str) -> Dict[str, Any]:
+    """Build the autoscaler's node types from the custom resource's pod types.
+
+    Each pod type is keyed by its "name"; its remaining fields are renamed
+    per ``NODE_TYPE_FIELDS`` and its pod metadata receives an owner
+    reference pointing at the RayCluster.
+    """
+    owner_reference = get_cluster_owner_reference(cluster_name, cluster_uid)
+    node_types = {}
+    for pod_type in pod_types:
+        type_name = pod_type["name"]
+        # Work on a deep copy so the incoming pod type is left untouched.
+        fields = copy.deepcopy(pod_type)
+        del fields["name"]
+        node_type = translate(fields, dictionary=NODE_TYPE_FIELDS)
+        # Deleting a RayCluster CR will also delete the associated pods.
+        node_type["node_config"]["metadata"]["ownerReferences"] = [
+            owner_reference
+        ]
+        node_types[type_name] = node_type
+    return node_types
+
+
+def get_cluster_owner_reference(cluster_name: str,
+                                cluster_uid: str) -> Dict[str, Any]:
+    """Return an ownerReferences entry naming the given RayCluster.
+
+    Args:
+        cluster_name: Stored under "name".
+        cluster_uid: Stored under "uid".
+    """
+    owner_reference = dict(
+        apiVersion="apps/v1",
+        controller=True,
+        blockOwnerDeletion=True,
+        kind="RayCluster",
+        name=cluster_name,
+        uid=cluster_uid)
+    return owner_reference
+
+
+def translate(configuration: Dict[str, Any],
+              dictionary: Dict[str, str]) -> Dict[str, Any]:
+    """Rename the keys of ``configuration`` according to ``dictionary``.
+
+    Values are carried over unchanged. A key of ``configuration`` that is
+    missing from ``dictionary`` raises ``KeyError``.
+    """
+    return {dictionary[key]: value for key, value in configuration.items()}
@@ -89,7 +89,6 @@ class RayParams:
            contents to Redis.
        autoscaling_config: path to autoscaling config file.
        java_worker_options (list): The command options for Java worker.
-        load_code_from_local: Whether load code from local file or from GCS.
        metrics_agent_port(int): The port to bind metrics agent.
        metrics_export_port(int): The port at which metrics are exposed
            through a Prometheus endpoint.
@@ -142,14 +141,12 @@ class RayParams:
                 include_log_monitor=None,
                 autoscaling_config=None,
                 java_worker_options=None,
-                 load_code_from_local=False,
                 start_initial_python_workers_for_first_job=False,
                 _system_config=None,
                 enable_object_reconstruction=False,
                 metrics_agent_port=None,
                 metrics_export_port=None,
-                 lru_evict=False,
-                 code_search_path=None):
+                 lru_evict=False):
        self.object_ref_seed = object_ref_seed
        self.redis_address = redis_address
        self.num_cpus = num_cpus
@@ -186,7 +183,6 @@ class RayParams:
        self.include_log_monitor = include_log_monitor
        self.autoscaling_config = autoscaling_config
        self.java_worker_options = java_worker_options
-        self.load_code_from_local = load_code_from_local
        self.metrics_agent_port = metrics_agent_port
        self.metrics_export_port = metrics_export_port
        self.start_initial_python_workers_for_first_job = (
@@ -195,9 +191,6 @@ class RayParams:
        self._lru_evict = lru_evict
        self._enable_object_reconstruction = enable_object_reconstruction
        self._check_usage()
-        self.code_search_path = code_search_path
-        if code_search_path is None:
-            self.code_search_path = []

        # Set the internal config options for LRU eviction.
        if lru_evict:
@@ -197,7 +197,8 @@ LOG_MONITOR_MAX_OPEN_FILES = 200
 # The object metadata field uses the following format: It is a comma
 # separated list of fields. The first field is mandatory and is the
 # type of the object (see types below) or an integer, which is interpreted
-# as an error value.
+# as an error value. The second part is optional and if present has the
+# form DEBUG:<breakpoint_id>, it is used for implementing the debugger.

 # A constant used as object metadata to indicate the object is cross language.
 OBJECT_METADATA_TYPE_CROSS_LANGUAGE = b"XLANG"
@@ -213,6 +214,9 @@ OBJECT_METADATA_TYPE_RAW = b"RAW"
 # of XLANG.
 OBJECT_METADATA_TYPE_ACTOR_HANDLE = b"ACTOR_HANDLE"

+# A constant indicating the debugging part of the metadata (see above).
+OBJECT_METADATA_DEBUG_PREFIX = b"DEBUG:"
+
 AUTOSCALER_RESOURCE_REQUEST_CHANNEL = b"autoscaler_resource_request"

 # The default password to prevent redis port scanning attack.
@@ -153,6 +153,46 @@ class StandardFdRedirectionRotatingFileHandler(RotatingFileHandler):
        os.dup2(self.stream.fileno(), self.get_original_stream().fileno())


+def get_worker_log_file_name(worker_type):
+    """Build the log file name (without extension) for the current worker.
+
+    The name has the form "<worker_name>-<worker id hex>-<job id>-<pid>".
+    For worker_type "WORKER" the name is "worker" and the job id comes from
+    the RAY_JOB_ID environment variable; for any other type the name is
+    "io_worker" and the nil job id is used.
+    """
+    is_regular_worker = worker_type == "WORKER"
+    if is_regular_worker:
+        job_id = os.environ.get("RAY_JOB_ID")
+        assert job_id is not None, (
+            "RAY_JOB_ID should be set as an env "
+            "variable within default_worker.py. If you see this error, "
+            "please report it to Ray's Github issue.")
+    else:
+        job_id = ray.JobID.nil()
+    worker_name = "worker" if is_regular_worker else "io_worker"
+
+    # Make sure these values are set already.
+    assert ray.worker._global_node is not None
+    assert ray.worker.global_worker is not None
+    worker_id_hex = binary_to_hex(ray.worker.global_worker.worker_id)
+    return f"{worker_name}-{worker_id_hex}-{job_id}-{os.getpid()}"
+
+
+def configure_log_file(out_file, err_file):
+    """Redirect this process's stdout and stderr to the given files.
+
+    Args:
+        out_file: Open file object whose descriptor replaces stdout's.
+        err_file: Open file object whose descriptor replaces stderr's.
+    """
+    # Capture the descriptor numbers before sys.stdout/sys.stderr are
+    # rebound below.
+    stdout_fileno = sys.stdout.fileno()
+    stderr_fileno = sys.stderr.fileno()
+    # C++ logging requires redirecting the stdout file descriptor. Note that
+    # dup2 will automatically close the old file descriptor before overriding
+    # it.
+    os.dup2(out_file.fileno(), stdout_fileno)
+    os.dup2(err_file.fileno(), stderr_fileno)
+    # We also manually set sys.stdout and sys.stderr because that seems to
+    # have an effect on the output buffering. Without doing this, stdout
+    # and stderr are heavily buffered resulting in seemingly lost logging
+    # statements. We never want to close the stdout file descriptor, dup2 will
+    # close it when necessary and we don't want python's GC to close it.
+    sys.stdout = ray.utils.open_log(
+        stdout_fileno, unbuffered=True, closefd=False)
+    sys.stderr = ray.utils.open_log(
+        stderr_fileno, unbuffered=True, closefd=False)
+
+
 def setup_and_get_worker_interceptor_logger(args,
                                            max_bytes=0,
                                            backup_count=0,
@@ -258,8 +258,12 @@ class RemoteFunction:
                placement_group.id,
                placement_group_bundle_index,
                placement_group_capture_child_tasks,
+                worker.debugger_breakpoint,
                override_environment_variables=override_environment_variables
                or dict())
+            # Reset worker's debug context from the last "remote" command
+            # (which applies only to this .remote call).
+            worker.debugger_breakpoint = b""
            if len(object_refs) == 1:
                return object_refs[0]
            elif len(object_refs) > 1:
@@ -6,6 +6,7 @@ import logging
 import os
 import subprocess
 import sys
+from telnetlib import Telnet
 import time
 import urllib
 import urllib.parse
@@ -150,6 +151,35 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
                from None


+def continue_debug_session():
+    """Continue active debugging session.
+
+    This function will connect 'ray debug' to the right debugger
+    when a user is stepping between Ray tasks.
+
+    It looks for internal-KV keys starting with "RAY_PDB_CONTINUE". For the
+    first one found it polls the matching "RAY_PDB_<suffix>" key once per
+    second until session data appears, then either stops (the session
+    contains "exit_debugger") or attaches to the session's "pdb_address"
+    over telnet and afterwards looks for a further session to continue.
+    Returns immediately if no such key exists.
+    """
+    active_sessions = ray.experimental.internal_kv._internal_kv_list(
+        "RAY_PDB_")
+
+    for active_session in active_sessions:
+        if active_session.startswith(b"RAY_PDB_CONTINUE"):
+            print("Continuing pdb session in different process...")
+            # Map the "continue" marker key to the key of the session it
+            # refers to by swapping the prefix.
+            key = b"RAY_PDB_" + active_session[len("RAY_PDB_CONTINUE_"):]
+            # Poll until the session data has been published under `key`.
+            while True:
+                data = ray.experimental.internal_kv._internal_kv_get(key)
+                if data:
+                    session = json.loads(data)
+                    if "exit_debugger" in session:
+                        # Debugging is over: drop the key and stop following.
+                        ray.experimental.internal_kv._internal_kv_del(key)
+                        return
+                    host, port = session["pdb_address"].split(":")
+                    # Hand the terminal over to the remote debugger until
+                    # the telnet interaction ends.
+                    with Telnet(host, int(port)) as tn:
+                        tn.interact()
+                    ray.experimental.internal_kv._internal_kv_del(key)
+                    # The user may have stepped into yet another task, so
+                    # look for a follow-up session before returning.
+                    continue_debug_session()
+                    return
+                time.sleep(1.0)
+
+
@cli.command()
@click.option(
    "--address",
@@ -158,12 +188,13 @@ def dashboard(cluster_config_file, cluster_name, port, remote_port):
    help="Override the address to connect to.")
 def debug(address):
    """Show all active breakpoints and exceptions in the Ray debugger."""
-    from telnetlib import Telnet
    if not address:
        address = services.get_ray_address_to_use_or_die()
    logger.info(f"Connecting to Ray instance at {address}.")
-    ray.init(address=address)
+    ray.init(address=address, log_to_driver=False)
    while True:
+        continue_debug_session()
+
        active_sessions = ray.experimental.internal_kv._internal_kv_list(
            "RAY_PDB_")
        print("Active breakpoints:")
@@ -358,25 +389,12 @@ def debug(address):
    default=None,
    type=str,
    help="Overwrite the options to start Java workers.")
-@click.option(
-    "--code-search-path",
-    default=None,
-    hidden=True,
-    type=str,
-    help="A list of directories or jar files separated by colon that specify "
-    "the search path for user code. This will be used as `CLASSPATH` in "
-    "Java and `PYTHONPATH` in Python.")
@click.option(
    "--system-config",
    default=None,
    hidden=True,
    type=json.loads,
    help="Override system configuration defaults.")
-@click.option(
-    "--load-code-from-local",
-    is_flag=True,
-    default=False,
-    help="Specify whether load code from local file or GCS serialization.")
@click.option(
    "--lru-evict",
    is_flag=True,
@@ -405,8 +423,7 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          head, include_dashboard, dashboard_host, dashboard_port, block,
          plasma_directory, autoscaling_config, no_redirect_worker_output,
          no_redirect_output, plasma_store_socket_name, raylet_socket_name,
-          temp_dir, java_worker_options, load_code_from_local,
-          code_search_path, system_config, lru_evict,
+          temp_dir, java_worker_options, system_config, lru_evict,
          enable_object_reconstruction, metrics_export_port, log_style,
          log_color, verbose):
    """Start Ray processes manually on the local machine."""
@@ -465,8 +482,6 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
-        load_code_from_local=load_code_from_local,
-        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
@@ -537,6 +552,8 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
+            # NOTE(kfstorm): The Java driver relies on this line to get the
+            # address of the cluster. Please be careful when updating it.
            cli_logger.print(
                cf.bold("  ray start --address='{}'{}"), redis_address,
                f" --redis-password='{redis_password}'"
@@ -632,7 +649,7 @@ def start(node_ip_address, address, port, redis_password, redis_shard_ports,
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
-                "Runing subprocesses are monitored and a message will be "
+                "Running subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        while True:
@@ -1273,7 +1290,7 @@ def stack():
    COMMAND = """
 pyspy=`which py-spy`
 if [ ! -e "$pyspy" ]; then
-    echo "ERROR: Please 'pip install py-spy' (or ray[debug]) first"
+    echo "ERROR: Please 'pip install py-spy' first"
    exit 1
 fi
 # Set IFS to iterate over lines instead of over words.
@@ -17,6 +17,14 @@ py_test(
    deps = [":serve_lib"],
 )

+py_test(
+    name = "test_controller",
+    size = "small",
+    srcs = serve_tests_srcs,
+    tags = ["exclusive"],
+    deps = [":serve_lib"],
+)
+
 py_test(
    name = "test_backend_worker",
    size = "small",
@@ -35,14 +43,13 @@ py_test(
 )


-# TODO(simon): Test skipped until #11683 fixed.
-# py_test(
-#     name = "test_failure",
-#     size = "medium",
-#     srcs = serve_tests_srcs,
-#     tags = ["exclusive"],
-#     deps = [":serve_lib"],
-# )
+py_test(
+    name = "test_failure",
+    size = "medium",
+    srcs = serve_tests_srcs,
+    tags = ["exclusive"],
+    deps = [":serve_lib"],
+)


 py_test(
@@ -1,6 +1,9 @@
+import asyncio
 import atexit
+import time
 from functools import wraps
 import os
+from uuid import UUID

 import ray
 from ray.serve.constants import (DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT,
@@ -42,6 +45,8 @@ class Client:
        self._controller_name = controller_name
        self._detached = detached
        self._shutdown = False
+        self._http_host, self._http_port = ray.get(
+            controller.get_http_config.remote())

        # NOTE(simon): Used to cache client.get_handle(endpoint) call. It will
        # mostly grow in size, it will only shrink when user calls the
@@ -62,9 +67,9 @@ class Client:

    def __del__(self):
        if not self._detached:
-            logger.info("Shutting down Ray Serve because client went out of "
-                        "scope. To prevent this, either keep a reference to "
-                        "the client object or use serve.start(detached=True).")
+            logger.debug("Shutting down Ray Serve because client went out of "
+                         "scope. To prevent this, either keep a reference to "
+                         "the client or use serve.start(detached=True).")
            self.shutdown()

    def __reduce__(self):
@@ -78,11 +83,34 @@ class Client:
        Shuts down all processes and deletes all state associated with the
        instance.
        """
-        if not self._shutdown:
+        if (not self._shutdown) and ray.is_initialized():
            ray.get(self._controller.shutdown.remote())
            ray.kill(self._controller, no_restart=True)
+
+            # Wait for the named actor entry to be removed as well.
+            started = time.time()
+            while True:
+                try:
+                    ray.get_actor(self._controller_name)
+                    if time.time() - started > 5:
+                        logger.warning(
+                            "Waited 5s for Serve to shutdown gracefully but "
+                            "the controller is still not cleaned up. "
+                            "You can ignore this warning if you are shutting "
+                            "down the Ray cluster.")
+                        break
+                except ValueError:  # actor name is removed
+                    break
+
            self._shutdown = True

+    @_ensure_connected
+    def _get_result(self, result_object_id: ray.ObjectRef) -> bool:
+        """Wait for a controller operation to finish and return its result.
+
+        Args:
+            result_object_id: Object ref resolving to the id of the event
+                that the controller is asked to wait for.
+
+        Returns:
+            Whatever the controller's ``wait_for_event`` call yields.
+        """
+        result_id: UUID = ray.get(result_object_id)
+        wait_ref = self._controller.wait_for_event.remote(result_id)
+        result = ray.get(wait_ref)
+        logger.debug(f"Getting result_id ({result_id}) with result: {result}")
+        return result
+
    @_ensure_connected
    def create_endpoint(self,
                        endpoint_name: str,
@@ -137,10 +165,33 @@ class Client:
                    "an element of type {}".format(type(method)))
            upper_methods.append(method.upper())

-        ray.get(
+        self._get_result(
            self._controller.create_endpoint.remote(
                endpoint_name, {backend: 1.0}, route, upper_methods))

+        # Block until the route table has been propagated to all HTTP proxies.
+        if route is not None:
+
+            def check_ready(http_response):
+                return route in http_response.json()
+
+            futures = []
+            for node_id in ray.state.node_ids():
+                future = block_until_http_ready.options(
+                    num_cpus=0, resources={
+                        node_id: 0.01
+                    }).remote(
+                        "http://{}:{}/-/routes".format(self._http_host,
+                                                       self._http_port),
+                        check_ready=check_ready,
+                        timeout=HTTP_PROXY_TIMEOUT)
+                futures.append(future)
+            try:
+                ray.get(futures)
+            except ray.exceptions.RayTaskError:
+                # The f-prefix is required: without it the message shows the
+                # literal text "{HTTP_PROXY_TIMEOUT}" instead of the timeout.
+                raise TimeoutError("Route not available at HTTP proxies "
+                                   f"after {HTTP_PROXY_TIMEOUT}s.")
+
    @_ensure_connected
    def delete_endpoint(self, endpoint: str) -> None:
        """Delete the given endpoint.
@@ -149,7 +200,7 @@ class Client:
        """
        if endpoint in self._handle_cache:
            del self._handle_cache[endpoint]
-        ray.get(self._controller.delete_endpoint.remote(endpoint))
+        self._get_result(self._controller.delete_endpoint.remote(endpoint))

    @_ensure_connected
    def list_endpoints(self) -> Dict[str, Dict[str, Any]]:
@@ -193,7 +244,7 @@ class Client:
                "config_options must be a BackendConfig or dictionary.")
        if isinstance(config_options, dict):
            config_options = BackendConfig.parse_obj(config_options)
-        ray.get(
+        self._get_result(
            self._controller.update_backend_config.remote(
                backend_tag, config_options))

@@ -222,7 +273,8 @@ class Client:
        Args:
            backend_tag (str): a unique tag assign to identify this backend.
            func_or_class (callable, class): a function or a class implementing
-                __call__.
+                __call__, returning a JSON-serializable object or a
+                Starlette Response object.
            actor_init_args (optional): the arguments to pass to the class.
                initialization method.
            ray_actor_options (optional): options to be passed into the
@@ -290,7 +342,7 @@ class Client:
            raise TypeError("config must be a BackendConfig or a dictionary.")

        backend_config._validate_complete()
-        ray.get(
+        self._get_result(
            self._controller.create_backend.remote(backend_tag, backend_config,
                                                   replica_config))

@@ -308,7 +360,7 @@ class Client:

        The backend must not currently be used by any endpoints.
        """
-        ray.get(self._controller.delete_backend.remote(backend_tag))
+        self._get_result(self._controller.delete_backend.remote(backend_tag))

    @_ensure_connected
    def set_traffic(self, endpoint_name: str,
@@ -327,7 +379,7 @@ class Client:
            traffic_policy_dictionary (dict): a dictionary maps backend names
                to their traffic weights. The weights must sum to 1.
        """
-        ray.get(
+        self._get_result(
            self._controller.set_traffic.remote(endpoint_name,
                                                traffic_policy_dictionary))

@@ -353,20 +405,24 @@ class Client:
                          (float, int)) or not 0 <= proportion <= 1:
            raise TypeError("proportion must be a float from 0 to 1.")

-        ray.get(
+        self._get_result(
            self._controller.shadow_traffic.remote(endpoint_name, backend_tag,
                                                   proportion))

    @_ensure_connected
    def get_handle(self,
                   endpoint_name: str,
-                   missing_ok: Optional[bool] = False) -> RayServeHandle:
+                   missing_ok: Optional[bool] = False,
+                   sync: bool = True) -> RayServeHandle:
        """Retrieve RayServeHandle for service endpoint to invoke it from Python.

        Args:
            endpoint_name (str): A registered service endpoint.
            missing_ok (bool): If true, then Serve won't check the endpoint is
                registered. False by default.
+            sync (bool): If true, then Serve will return a ServeHandle that
+                works everywhere. Otherwise, Serve will return a ServeHandle
+                that's only usable in asyncio loop.

        Returns:
            RayServeHandle
@@ -375,8 +431,14 @@ class Client:
                self._controller.get_all_endpoints.remote()):
            raise KeyError(f"Endpoint '{endpoint_name}' does not exist.")

+        if asyncio.get_event_loop().is_running() and sync:
+            logger.warning(
+                "You are retrieving a ServeHandle inside an asyncio loop. "
+                "Try getting client.get_handle(.., sync=False) to get better "
+                "performance.")
+
        if endpoint_name not in self._handle_cache:
-            handle = RayServeHandle(self._controller, endpoint_name, sync=True)
+            handle = RayServeHandle(self._controller, endpoint_name, sync=sync)
            self._handle_cache[endpoint_name] = handle
        return self._handle_cache[endpoint_name]

@@ -445,7 +507,11 @@ def start(detached: bool = False,
                    "http://{}:{}/-/routes".format(http_host, http_port),
                    timeout=HTTP_PROXY_TIMEOUT)
            futures.append(future)
-        ray.get(futures)
+        try:
+            ray.get(futures)
+        except ray.exceptions.RayTaskError:
+            raise TimeoutError(
+                f"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")

    return Client(controller, controller_name, detached=detached)

@@ -15,10 +15,13 @@ from ray.serve.utils import (parse_request_item, _get_logger, chain_future,
 from ray.serve.exceptions import RayServeException
 from ray.util import metrics
 from ray.serve.config import BackendConfig
-from ray.serve.long_poll import LongPollerAsyncClient
+from ray.serve.long_poll import LongPollAsyncClient
 from ray.serve.router import Query
-from ray.serve.constants import (DEFAULT_LATENCY_BUCKET_MS,
-                                 BACKEND_RECONFIGURE_METHOD)
+from ray.serve.constants import (
+    BACKEND_RECONFIGURE_METHOD,
+    DEFAULT_LATENCY_BUCKET_MS,
+    LongPollKey,
+)
 from ray.exceptions import RayTaskError

 logger = _get_logger()
@@ -168,8 +171,8 @@ class RayServeReplica:
            tag_keys=("backend", ))
        self.request_counter.set_default_tags({"backend": self.backend_tag})

-        self.long_poll_client = LongPollerAsyncClient(controller_handle, {
-            "backend_configs": self._update_backend_configs,
+        self.long_poll_client = LongPollAsyncClient(controller_handle, {
+            LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
        })

        self.error_counter = metrics.Count(
@@ -1,7 +1,7 @@
 cluster_name: default
-min_workers: 22
-max_workers: 22
-initial_workers: 22
+min_workers: 5
+max_workers: 5
+initial_workers: 5
 autoscaling_mode: default
 docker:
    image: 'anyscale/ray-ml:latest'
@@ -28,6 +28,7 @@ initialization_commands: []
 setup_commands:
    - apt-get install build-essential libssl-dev git -y
    - 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
+    - ray install-nightly
 head_setup_commands: []
 worker_setup_commands: []
 head_start_ray_commands:
@@ -23,64 +23,88 @@
 # 2 forwarders and 5 worker replicas: 620 requests/s
 # 2 forwarders and 10 worker replicas: 609 requests/s

+import asyncio
+import time
+
 import ray
 from ray import serve
 from ray.serve import BackendConfig
-from ray.serve.utils import logger
-import time

-num_queries = 2000
+num_queries = 10000
+max_concurrent_queries = 100000

 ray.init(address="auto")

-client = serve.start()

-
-def hello_world(_):
+def worker(_):
    return b"Hello World"


 class ForwardActor:
-    def __init__(self):
+    def __init__(self, sync: bool):
        client = serve.connect()
-        self.handle = client.get_handle("hello_world")
+        self.sync = sync
+        self.handle = client.get_handle("worker", sync=sync)

    async def __call__(self, _):
-        await self.handle.remote()
+        if self.sync:
+            await self.handle.remote()
+        else:
+            await (await self.handle.remote_async())


-client.create_backend("hello_world", hello_world)
-client.create_endpoint("hello_world", backend="hello_world")
+async def run_test(num_replicas, num_forwarders, sync):
+    client = serve.start()
+    client.create_backend(
+        "worker",
+        worker,
+        config=BackendConfig(
+            num_replicas=num_replicas,
+            max_concurrent_queries=max_concurrent_queries,
+        ))
+    client.create_endpoint("worker", backend="worker")
+    endpoint_name = "worker"

-client.create_backend("ForwardActor", ForwardActor)
-client.create_endpoint("ForwardActor", backend="ForwardActor")
+    if num_forwarders > 0:
+        client.create_backend(
+            "ForwardActor",
+            ForwardActor,
+            sync,
+            config=BackendConfig(
+                num_replicas=num_forwarders,
+                max_concurrent_queries=max_concurrent_queries))
+        client.create_endpoint("ForwardActor", backend="ForwardActor")
+        endpoint_name = "ForwardActor"

-
-def run_test(num_replicas, num_forwarders):
-    replicas_config = BackendConfig(num_replicas=num_replicas)
-    client.update_backend_config("hello_world", replicas_config)
-
-    if (num_forwarders == 0):
-        handle = client.get_handle("hello_world")
-    else:
-        forwarders_config = BackendConfig(num_replicas=num_forwarders)
-        client.update_backend_config("ForwardActor", forwarders_config)
-        handle = client.get_handle("ForwardActor")
+    handle = client.get_handle(endpoint_name, sync=sync)

    # warmup - helpful to wait for gc.collect() and actors to start
    start = time.time()
    while time.time() - start < 1:
-        ray.get(handle.remote())
+        if sync:
+            ray.get(handle.remote())
+        else:
+            ray.get(await handle.remote_async())

    # real test
    start = time.time()
-    ray.get([handle.remote() for _ in range(num_queries)])
+    if sync:
+        ray.get([handle.remote() for _ in range(num_queries)])
+    else:
+        ray.get([(await handle.remote_async()) for _ in range(num_queries)])
    qps = num_queries / (time.time() - start)

-    logger.info("{} forwarders and {} worker replicas: {} requests/s".format(
-        num_forwarders, num_replicas, int(qps)))
+    print(
+        f"Sync: {sync}, {num_forwarders} forwarders and {num_replicas} worker "
+        f"replicas: {int(qps)} requests/s")
+    client.shutdown()


-for num_forwarders in [0, 1, 2]:
-    for num_replicas in [1, 5, 10]:
-        run_test(num_replicas, num_forwarders)
+async def main():
+    for sync in [True, False]:
+        for num_forwarders in [0, 1, 2]:
+            for num_replicas in [1, 5, 10]:
+                await run_test(num_replicas, num_forwarders, sync)
+
+
+asyncio.get_event_loop().run_until_complete(main())
@@ -86,13 +86,14 @@ async def main():
    client.create_backend("backend", backend)
    client.create_endpoint("endpoint", backend="backend", route="/api")
    for intermediate_handles in [False, True]:
-        if (intermediate_handles):
+        if intermediate_handles:

            client.create_endpoint(
                "backend", backend="backend", route="/backend")

            class forwardActor:
                def __init__(self):
+                    client = serve.connect()
                    self.handle = client.get_handle("backend")

                def __call__(self, _):
@@ -36,73 +36,76 @@ from ray import serve
 from ray.serve import BackendConfig
 from ray.serve.utils import logger

-from ray.util.placement_group import (placement_group, remove_placement_group)
+from ray.util.placement_group import placement_group, remove_placement_group

 ray.shutdown()
 ray.init(address="auto")
-client = serve.start()

-# These numbers need to correspond with the autoscaler config file.
-# The number of remote nodes in the autoscaler should upper bound
-# these because sometimes nodes fail to update.
-num_workers = 20
-expected_num_nodes = num_workers + 1
-cpus_per_node = 4
-num_remote_cpus = expected_num_nodes * cpus_per_node
+# We ask for more workers than we need and only run on a smaller subset.
+# This accounts for worker nodes that fail to launch.
+expected_num_nodes = 6
+num_replicas = 11
+# wrk HTTP load testing config
+num_connections = 20
+num_threads = 2
+time_to_run = "20s"

 # Wait until the expected number of nodes have joined the cluster.
 while True:
-    num_nodes = len(ray.nodes())
+    num_nodes = len(list(filter(lambda node: node["Alive"], ray.nodes())))
    logger.info("Waiting for nodes {}/{}".format(num_nodes,
                                                 expected_num_nodes))
    if num_nodes >= expected_num_nodes:
        break
    time.sleep(5)
+
 logger.info("Nodes have all joined. There are %s resources.",
            ray.cluster_resources())

+client = serve.start()
+

 def hey(_):
    time.sleep(0.01)  # Sleep for 10ms
    return b"hey"


-num_connections = int(num_remote_cpus * 0.75)
-num_threads = 2
-time_to_run = "10s"
-
 pg = placement_group(
    [{
        "CPU": 1
    } for _ in range(expected_num_nodes)], strategy="STRICT_SPREAD")
 ray.get(pg.ready())

-# The number of replicas is the number of cores remaining after accounting
-# for the one HTTP proxy actor on each node, the "hey" requester task on each
-# node, and the serve controller.
-# num_replicas = expected_num_nodes * (cpus_per_node - 2) - 1
-num_replicas = ray.available_resources()["CPU"]
 logger.info("Starting %i replicas", num_replicas)
 client.create_backend(
    "hey", hey, config=BackendConfig(num_replicas=num_replicas))
 client.create_endpoint("hey", backend="hey", route="/hey")


-@ray.remote
+@ray.remote(num_cpus=0)
 def run_wrk():
-    logger.info("Warming up for ~3 seconds")
-    for _ in range(5):
-        resp = requests.get("http://127.0.0.1:8000/hey").text
-        logger.info("Received response \'" + resp + "\'")
-        time.sleep(0.5)
+    logger.info("Warming up")
+    for _ in range(10):
+        try:
+            resp = requests.get("http://127.0.0.1:8000/hey").text
+            logger.info("Received response '" + resp + "'")
+        except Exception as e:
+            logger.info(f"Got exception {e}")
+        time.sleep(0.5)

    result = subprocess.run(
        [
-            "wrk", "-c",
-            str(num_connections), "-t",
-            str(num_threads), "-d", time_to_run, "http://127.0.0.1:8000/hey"
+            "wrk",
+            "-c",
+            str(num_connections),
+            "-t",
+            str(num_threads),
+            "-d",
+            time_to_run,
+            "http://127.0.0.1:8000/hey",
        ],
-        stdout=subprocess.PIPE)
+        stdout=subprocess.PIPE,
+    )
    return result.stdout.decode()


@@ -23,6 +23,7 @@ initialization_commands: []
 setup_commands:
    - apt-get install build-essential libssl-dev git -y
    - 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
+    - ray install-nightly
 head_setup_commands: []
 worker_setup_commands: []
 head_start_ray_commands:
@@ -1,3 +1,5 @@
+from enum import auto, Enum
+
 #: Actor name used to register controller
 SERVE_CONTROLLER_NAME = "SERVE_CONTROLLER_ACTOR"

@@ -37,3 +39,13 @@ DEFAULT_LATENCY_BUCKET_MS = [

 #: Name of backend reconfiguration method implemented by user.
 BACKEND_RECONFIGURE_METHOD = "reconfigure"
+
+
+class LongPollKey(Enum):
+    def __repr__(self):
+        return f"{self.__class__.__name__}.{self.name}"
+
+    REPLICA_HANDLES = auto()
+    TRAFFIC_POLICIES = auto()
+    BACKEND_CONFIGS = auto()
+    ROUTE_TABLE = auto()
@@ -6,20 +6,22 @@ import random
 import time
 from dataclasses import dataclass, field
 from typing import Dict, Any, List, Optional, Tuple
+from uuid import uuid4, UUID
 from pydantic import BaseModel

 import ray
 import ray.cloudpickle as pickle
 from ray.serve.autoscaling_policy import BasicAutoscalingPolicy
 from ray.serve.backend_worker import create_backend_replica
-from ray.serve.constants import ASYNC_CONCURRENCY, SERVE_PROXY_NAME
+from ray.serve.constants import (ASYNC_CONCURRENCY, SERVE_PROXY_NAME,
+                                 LongPollKey)
 from ray.serve.http_proxy import HTTPProxyActor
 from ray.serve.kv_store import RayInternalKVStore
 from ray.serve.exceptions import RayServeException
 from ray.serve.utils import (format_actor_name, get_random_letters, logger,
                             try_schedule_resources_on_nodes, get_all_node_ids)
 from ray.serve.config import BackendConfig, ReplicaConfig
-from ray.serve.long_poll import LongPollerHost
+from ray.serve.long_poll import LongPollHost
 from ray.actor import ActorHandle

 import numpy as np
@@ -144,7 +146,7 @@ class ActorStateReconciler:
    controller_name: str = field(init=True)
    detached: bool = field(init=True)

-    routers_cache: Dict[NodeId, ActorHandle] = field(default_factory=dict)
+    http_proxy_cache: Dict[NodeId, ActorHandle] = field(default_factory=dict)
    backend_replicas: Dict[BackendTag, Dict[ReplicaTag, ActorHandle]] = field(
        default_factory=lambda: defaultdict(dict))
    backend_replicas_to_start: Dict[BackendTag, List[ReplicaTag]] = field(
@@ -156,8 +158,8 @@ class ActorStateReconciler:

    # TODO(edoakes): consider removing this and just using the names.

-    def router_handles(self) -> List[ActorHandle]:
-        return list(self.routers_cache.values())
+    def http_proxy_handles(self) -> List[ActorHandle]:
+        return list(self.http_proxy_cache.values())

    def get_replica_handles(self) -> List[ActorHandle]:
        return list(
@@ -302,7 +304,7 @@ class ActorStateReconciler:
    async def _stop_pending_backend_replicas(self) -> None:
        """Stops the pending backend replicas in self.backend_replicas_to_stop.

-        Removes backend_replicas from the router, kills them, and clears
+        Removes backend_replicas from the http_proxy, kills them, and clears
        self.backend_replicas_to_stop.
        """
        for backend_tag, replicas_list in self.backend_replicas_to_stop.items(
@@ -326,26 +328,26 @@ class ActorStateReconciler:

        self.backend_replicas_to_stop.clear()

-    def _start_routers_if_needed(self, http_host: str, http_port: str,
-                                 http_middlewares: List[Any]) -> None:
-        """Start a router on every node if it doesn't already exist."""
+    def _start_http_proxies_if_needed(self, http_host: str, http_port: str,
+                                      http_middlewares: List[Any]) -> None:
+        """Start an HTTP proxy on every node if it doesn't already exist."""
        if http_host is None:
            return

        for node_id, node_resource in get_all_node_ids():
-            if node_id in self.routers_cache:
+            if node_id in self.http_proxy_cache:
                continue

-            router_name = format_actor_name(SERVE_PROXY_NAME,
-                                            self.controller_name, node_id)
+            name = format_actor_name(SERVE_PROXY_NAME, self.controller_name,
+                                     node_id)
            try:
-                router = ray.get_actor(router_name)
+                proxy = ray.get_actor(name)
            except ValueError:
-                logger.info("Starting router with name '{}' on node '{}' "
+                logger.info("Starting HTTP proxy with name '{}' on node '{}' "
                            "listening on '{}:{}'".format(
-                                router_name, node_id, http_host, http_port))
-                router = HTTPProxyActor.options(
-                    name=router_name,
+                                name, node_id, http_host, http_port))
+                proxy = HTTPProxyActor.options(
+                    name=name,
                    lifetime="detached" if self.detached else None,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
@@ -359,10 +361,10 @@ class ActorStateReconciler:
                    controller_name=self.controller_name,
                    http_middlewares=http_middlewares)

-            self.routers_cache[node_id] = router
+            self.http_proxy_cache[node_id] = proxy

-    def _stop_routers_if_needed(self) -> bool:
-        """Removes router actors from any nodes that no longer exist.
+    def _stop_http_proxies_if_needed(self) -> bool:
+        """Removes HTTP proxy actors from any nodes that no longer exist.

        Returns whether or not any actors were removed (a checkpoint should
        be taken).
@@ -370,25 +372,25 @@ class ActorStateReconciler:
        actor_stopped = False
        all_node_ids = {node_id for node_id, _ in get_all_node_ids()}
        to_stop = []
-        for node_id in self.routers_cache:
+        for node_id in self.http_proxy_cache:
            if node_id not in all_node_ids:
-                logger.info(
-                    "Removing router on removed node '{}'.".format(node_id))
+                logger.info("Removing HTTP proxy on removed node '{}'.".format(
+                    node_id))
                to_stop.append(node_id)

        for node_id in to_stop:
-            router_handle = self.routers_cache.pop(node_id)
-            ray.kill(router_handle, no_restart=True)
+            proxy = self.http_proxy_cache.pop(node_id)
+            ray.kill(proxy, no_restart=True)
            actor_stopped = True

        return actor_stopped

    def _recover_actor_handles(self) -> None:
        # Refresh the RouterCache
-        for node_id in self.routers_cache.keys():
-            router_name = format_actor_name(SERVE_PROXY_NAME,
-                                            self.controller_name, node_id)
-            self.routers_cache[node_id] = ray.get_actor(router_name)
+        for node_id in self.http_proxy_cache.keys():
+            name = format_actor_name(SERVE_PROXY_NAME, self.controller_name,
+                                     node_id)
+            self.http_proxy_cache[node_id] = ray.get_actor(name)

        # Fetch actor handles for all of the backend replicas in the system.
        # All of these backend_replicas are guaranteed to already exist because
@@ -420,12 +422,19 @@ class ActorStateReconciler:
        return autoscaling_policies


+@dataclass
+class FutureResult:
+    # Goal requested when this future was created
+    requested_goal: Dict[str, Any]
+
+
@dataclass
 class Checkpoint:
     goal_state: SystemState
     current_state: SystemState
     reconciler: ActorStateReconciler
     # TODO(ilr) Rename reconciler to PendingState
+    inflight_reqs: Dict[UUID, FutureResult]


@ray.remote
@@ -474,7 +483,7 @@ class ServeController:
        # backend -> AutoscalingPolicy
        self.autoscaling_policies = dict()

-        # Dictionary of backend_tag -> router_name -> most recent queue length.
+        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
@@ -487,56 +496,87 @@ class ServeController:

        # If starting the actor for the first time, starts up the other system
        # components. If recovering, fetches their actor handles.
-        self.actor_reconciler._start_routers_if_needed(
+        self.actor_reconciler._start_http_proxies_if_needed(
            self.http_host, self.http_port, self.http_middlewares)

-        # NOTE(edoakes): unfortunately, we can't completely recover from a
-        # checkpoint in the constructor because we block while waiting for
-        # other actors to start up, and those actors fetch soft state from
-        # this actor. Because no other tasks will start executing until after
-        # the constructor finishes, if we were to run this logic in the
-        # constructor it could lead to deadlock between this actor and a child.
-        # However we do need to guarantee that we have fully recovered from a
-        # checkpoint before any other state-changing calls run. We address this
-        # by acquiring the write_lock and then posting the task to recover from
-        # a checkpoint to the event loop. Other state-changing calls acquire
-        # this lock and will be blocked until recovering from the checkpoint
-        # finishes.
+        # Map of awaiting results
+        # TODO(ilr): Checkpoint this once this becomes asynchronous
+        self.inflight_results: Dict[UUID, asyncio.Event] = dict()
+        self._serializable_inflight_results: Dict[UUID, FutureResult] = dict()
+
        checkpoint = self.kv_store.get(CHECKPOINT_KEY)
        if checkpoint is None:
            logger.debug("No checkpoint found")
        else:
-            await self.write_lock.acquire()
-            asyncio.get_event_loop().create_task(
-                self._recover_from_checkpoint(checkpoint))
+            await self._recover_from_checkpoint(checkpoint)

        # NOTE(simon): Currently we do all-to-all broadcast. This means
        # any listeners will receive notification for all changes. This
        # can be problem at scale, e.g. updating a single backend config
        # will send over the entire configs. In the future, we should
        # optimize the logic to support subscription by key.
-        self.long_poll_host = LongPollerHost()
+        self.long_poll_host = LongPollHost()
+
+        # The configs pushed out here get updated by
+        # self._recover_from_checkpoint in the failure scenario, so that must
+        # be run before we notify the changes.
        self.notify_backend_configs_changed()
        self.notify_replica_handles_changed()
        self.notify_traffic_policies_changed()
+        self.notify_route_table_changed()

        asyncio.get_event_loop().create_task(self.run_control_loop())

+    async def wait_for_event(self, uuid: UUID) -> bool:
+        if uuid not in self.inflight_results:
+            return True
+        event = self.inflight_results[uuid]
+        await event.wait()
+        self.inflight_results.pop(uuid)
+        self._serializable_inflight_results.pop(uuid)
+        async with self.write_lock:
+            self._checkpoint()
+
+        return True
+
+    def _create_event_with_result(
+            self,
+            goal_state: Dict[str, Any],
+            recreation_uuid: Optional[UUID] = None) -> UUID:
+        # NOTE(ilr) Must be called before checkpointing!
+        event = asyncio.Event()
+        event.result = FutureResult(goal_state)
+        event.set()
+        uuid_val = recreation_uuid or uuid4()
+        self.inflight_results[uuid_val] = event
+        self._serializable_inflight_results[uuid_val] = event.result
+        return uuid_val
+
+    async def _num_inflight_results(self) -> int:
+        return len(self.inflight_results)
+
    def notify_replica_handles_changed(self):
        self.long_poll_host.notify_changed(
-            "worker_handles", {
+            LongPollKey.REPLICA_HANDLES, {
                backend_tag: list(replica_dict.values())
                for backend_tag, replica_dict in
                self.actor_reconciler.backend_replicas.items()
            })

    def notify_traffic_policies_changed(self):
-        self.long_poll_host.notify_changed("traffic_policies",
-                                           self.current_state.traffic_policies)
+        self.long_poll_host.notify_changed(
+            LongPollKey.TRAFFIC_POLICIES,
+            self.current_state.traffic_policies,
+        )

    def notify_backend_configs_changed(self):
        self.long_poll_host.notify_changed(
-            "backend_configs", self.current_state.get_backend_configs())
+            LongPollKey.BACKEND_CONFIGS,
+            self.current_state.get_backend_configs())
+
+    def notify_route_table_changed(self):
+        self.long_poll_host.notify_changed(LongPollKey.ROUTE_TABLE,
+                                           self.current_state.routes)

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.
@@ -549,13 +589,9 @@ class ServeController:
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

-    def get_routers(self) -> Dict[str, ActorHandle]:
-        """Returns a dictionary of node ID to router actor handles."""
-        return self.actor_reconciler.routers_cache
-
-    def get_router_config(self) -> Dict[str, Tuple[str, List[str]]]:
-        """Called by the router on startup to fetch required state."""
-        return self.current_state.routes
+    def get_http_proxies(self) -> Dict[str, ActorHandle]:
+        """Returns a dictionary of node ID to http_proxy actor handles."""
+        return self.actor_reconciler.http_proxy_cache

    def _checkpoint(self) -> None:
        """Checkpoint internal state and write it to the KV store."""
@@ -565,7 +601,8 @@ class ServeController:

        checkpoint = pickle.dumps(
            Checkpoint(self.goal_state, self.current_state,
-                       self.actor_reconciler))
+                       self.actor_reconciler,
+                       self._serializable_inflight_results))

        self.kv_store.put(CHECKPOINT_KEY, checkpoint)
        logger.debug("Wrote checkpoint in {:.2f}".format(time.time() - start))
@@ -578,35 +615,51 @@ class ServeController:
    async def _recover_from_checkpoint(self, checkpoint_bytes: bytes) -> None:
        """Recover the instance state from the provided checkpoint.

+        This should be called in the constructor to ensure that the internal
+        state is updated before any other operations run. After running this,
+        internal state will be updated and long-poll clients may be notified.
+
        Performs the following operations:
            1) Deserializes the internal state from the checkpoint.
-            2) Pushes the latest configuration to the routers
-               in case we crashed before updating them.
-            3) Starts/stops any replicas that are pending creation or
+            2) Starts/stops any replicas that are pending creation or
               deletion.
-
-        NOTE: this requires that self.write_lock is already acquired and will
-        release it before returning.
        """
-        assert self.write_lock.locked()
-
        start = time.time()
        logger.info("Recovering from checkpoint")

        restored_checkpoint: Checkpoint = pickle.loads(checkpoint_bytes)
-        # Restore SystemState
        self.current_state = restored_checkpoint.current_state

-        # Restore ActorStateReconciler
        self.actor_reconciler = restored_checkpoint.reconciler

-        self.autoscaling_policies = await self.actor_reconciler.\
-            _recover_from_checkpoint(self.current_state, self)
+        self._serializable_inflight_results = restored_checkpoint.inflight_reqs
+        for uuid, fut_result in self._serializable_inflight_results.items():
+            self._create_event_with_result(fut_result.requested_goal, uuid)

-        logger.info(
-            "Recovered from checkpoint in {:.3f}s".format(time.time() - start))
+        # NOTE(edoakes): unfortunately, we can't completely recover from a
+        # checkpoint in the constructor because we block while waiting for
+        # other actors to start up, and those actors fetch soft state from
+        # this actor. Because no other tasks will start executing until after
+        # the constructor finishes, if we were to run this logic in the
+        # constructor it could lead to deadlock between this actor and a child.
+        # However, we do need to guarantee that we have fully recovered from a
+        # checkpoint before any other state-changing calls run. We address this
+        # by acquiring the write_lock and then posting the task to recover from
+        # a checkpoint to the event loop. Other state-changing calls acquire
+        # this lock and will be blocked until recovering from the checkpoint
+        # finishes. This can be removed once we move to the async control loop.

-        self.write_lock.release()
+        async def finish_recover_from_checkpoint():
+            assert self.write_lock.locked()
+            self.autoscaling_policies = await self.actor_reconciler.\
+                _recover_from_checkpoint(self.current_state, self)
+            self.write_lock.release()
+            logger.info(
+                "Recovered from checkpoint in {:.3f}s".format(time.time() -
+                                                              start))
+
+        await self.write_lock.acquire()
+        asyncio.get_event_loop().create_task(finish_recover_from_checkpoint())

    async def do_autoscale(self) -> None:
        for backend, info in self.current_state.backends.items():
@@ -623,44 +676,30 @@ class ServeController:
        while True:
            await self.do_autoscale()
            async with self.write_lock:
-                self.actor_reconciler._start_routers_if_needed(
+                self.actor_reconciler._start_http_proxies_if_needed(
                    self.http_host, self.http_port, self.http_middlewares)
                checkpoint_required = self.actor_reconciler.\
-                    _stop_routers_if_needed()
+                    _stop_http_proxies_if_needed()
                if checkpoint_required:
                    self._checkpoint()

            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

-    def get_backend_configs(self) -> Dict[str, BackendConfig]:
-        """Fetched by the router on startup."""
-        return self.current_state.get_backend_configs()
-
-    def get_traffic_policies(self) -> Dict[str, TrafficPolicy]:
-        """Fetched by the router on startup."""
-        return self.current_state.traffic_policies
-
-    def _list_replicas(self, backend_tag: BackendTag) -> List[ReplicaTag]:
-        """Used only for testing."""
-        return list(self.actor_reconciler.backend_replicas[backend_tag].keys())
-
-    def get_traffic_policy(self, endpoint: str) -> TrafficPolicy:
-        """Fetched by serve handles."""
-        return self.current_state.traffic_policies[endpoint]
-
-    def get_all_replica_handles(self) -> Dict[str, Dict[str, ActorHandle]]:
-        """Fetched by the router on startup."""
+    def _all_replica_handles(
+            self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
+        """Used for testing."""
        return self.actor_reconciler.backend_replicas

-    def get_all_backends(self) -> Dict[str, BackendConfig]:
+    def get_all_backends(self) -> Dict[BackendTag, BackendConfig]:
        """Returns a dictionary of backend tag to backend config."""
        return self.current_state.get_backend_configs()

-    def get_all_endpoints(self) -> Dict[str, Dict[str, Any]]:
+    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
+        """Returns a dictionary of endpoint tag to endpoint info."""
        return self.current_state.get_endpoints()

    async def _set_traffic(self, endpoint_name: str,
-                           traffic_dict: Dict[str, float]) -> None:
+                           traffic_dict: Dict[str, float]) -> UUID:
        if endpoint_name not in self.current_state.get_endpoints():
            raise ValueError("Attempted to assign traffic for an endpoint '{}'"
                             " that is not registered.".format(endpoint_name))
@@ -677,21 +716,25 @@ class ServeController:
        traffic_policy = TrafficPolicy(traffic_dict)
        self.current_state.traffic_policies[endpoint_name] = traffic_policy

+        return_uuid = self._create_event_with_result({
+            endpoint_name: traffic_policy
+        })
        # NOTE(edoakes): we must write a checkpoint before pushing the
        # update to avoid inconsistent state if we crash after pushing the
        # update.
        self._checkpoint()
-
        self.notify_traffic_policies_changed()
+        return return_uuid

    async def set_traffic(self, endpoint_name: str,
-                          traffic_dict: Dict[str, float]) -> None:
+                          traffic_dict: Dict[str, float]) -> UUID:
        """Sets the traffic policy for the specified endpoint."""
        async with self.write_lock:
-            await self._set_traffic(endpoint_name, traffic_dict)
+            return_uuid = await self._set_traffic(endpoint_name, traffic_dict)
+        return return_uuid

    async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag,
-                             proportion: float) -> None:
+                             proportion: float) -> UUID:
        """Shadow traffic from the endpoint to the backend."""
        async with self.write_lock:
            if endpoint_name not in self.current_state.get_endpoints():
@@ -707,16 +750,22 @@ class ServeController:
            self.current_state.traffic_policies[endpoint_name].set_shadow(
                backend_tag, proportion)

+            traffic_policy = self.current_state.traffic_policies[endpoint_name]
+
+            return_uuid = self._create_event_with_result({
+                endpoint_name: traffic_policy
+            })
            # NOTE(edoakes): we must write a checkpoint before pushing the
            # update to avoid inconsistent state if we crash after pushing the
            # update.
            self._checkpoint()
            self.notify_traffic_policies_changed()
+            return return_uuid

    # TODO(architkulkarni): add Optional for route after cloudpickle upgrade
    async def create_endpoint(self, endpoint: str,
                              traffic_dict: Dict[str, float], route,
-                              methods) -> None:
+                              methods) -> UUID:
        """Create a new endpoint with the specified route and methods.

        If the route is None, this is a "headless" endpoint that will not
@@ -755,13 +804,11 @@ class ServeController:
            self.current_state.routes[route] = (endpoint, methods)

            # NOTE(edoakes): checkpoint is written in self._set_traffic.
-            await self._set_traffic(endpoint, traffic_dict)
-            await asyncio.gather(*[
-                router.set_route_table.remote(self.current_state.routes)
-                for router in self.actor_reconciler.router_handles()
-            ])
+            return_uuid = await self._set_traffic(endpoint, traffic_dict)
+            self.notify_route_table_changed()
+            return return_uuid

-    async def delete_endpoint(self, endpoint: str) -> None:
+    async def delete_endpoint(self, endpoint: str) -> UUID:
        """Delete the specified endpoint.

        Does not modify any corresponding backends.
@@ -788,19 +835,20 @@ class ServeController:

            self.actor_reconciler.endpoints_to_remove.append(endpoint)

+            return_uuid = self._create_event_with_result({
+                route_to_delete: None,
+                endpoint: None
+            })
            # NOTE(edoakes): we must write a checkpoint before pushing the
-            # updates to the routers to avoid inconsistent state if we crash
+            # updates to the proxies to avoid inconsistent state if we crash
            # after pushing the update.
            self._checkpoint()
-
-            await asyncio.gather(*[
-                router.set_route_table.remote(self.current_state.routes)
-                for router in self.actor_reconciler.router_handles()
-            ])
+            self.notify_route_table_changed()
+            return return_uuid

    async def create_backend(self, backend_tag: BackendTag,
                             backend_config: BackendConfig,
-                             replica_config: ReplicaConfig) -> None:
+                             replica_config: ReplicaConfig) -> UUID:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            # Ensures this method is idempotent.
@@ -815,12 +863,11 @@ class ServeController:

            # Save creator that starts replicas, the arguments to be passed in,
            # and the configuration for the backends.
-            self.current_state.add_backend(
-                backend_tag,
-                BackendInfo(
-                    worker_class=backend_replica,
-                    backend_config=backend_config,
-                    replica_config=replica_config))
+            backend_info = BackendInfo(
+                worker_class=backend_replica,
+                backend_config=backend_config,
+                replica_config=replica_config)
+            self.current_state.add_backend(backend_tag, backend_info)
            metadata = backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                self.autoscaling_policies[
@@ -835,6 +882,9 @@ class ServeController:
                del self.current_state.backends[backend_tag]
                raise e

+            return_uuid = self._create_event_with_result({
+                backend_tag: backend_info
+            })
            # NOTE(edoakes): we must write a checkpoint before starting new
            # or pushing the updated config to avoid inconsistent state if we
            # crash while making the change.
@@ -844,11 +894,12 @@ class ServeController:

            self.notify_replica_handles_changed()

-            # Set the backend config inside the router
+            # Set the backend config inside routers
            # (particularly for max_concurrent_queries).
            self.notify_backend_configs_changed()
+            return return_uuid

-    async def delete_backend(self, backend_tag: BackendTag) -> None:
+    async def delete_backend(self, backend_tag: BackendTag) -> UUID:
        async with self.write_lock:
            # This method must be idempotent. We should validate that the
            # specified backend exists on the client.
@@ -876,19 +927,21 @@ class ServeController:
            if backend_tag in self.autoscaling_policies:
                del self.autoscaling_policies[backend_tag]

-            # Add the intention to remove the backend from the router.
+            # Add the intention to remove the backend from the routers.
            self.actor_reconciler.backends_to_remove.append(backend_tag)

+            return_uuid = self._create_event_with_result({backend_tag: None})
            # NOTE(edoakes): we must write a checkpoint before removing the
-            # backend from the router to avoid inconsistent state if we crash
+            # backend from the routers to avoid inconsistent state if we crash
            # after pushing the update.
            self._checkpoint()
            await self.actor_reconciler._stop_pending_backend_replicas()

            self.notify_replica_handles_changed()
+            return return_uuid

    async def update_backend_config(self, backend_tag: BackendTag,
-                                    config_options: BackendConfig) -> None:
+                                    config_options: BackendConfig) -> UUID:
        """Set the config for the specified backend."""
        async with self.write_lock:
            assert (self.current_state.get_backend(backend_tag)
@@ -902,18 +955,22 @@ class ServeController:
            backend_config._validate_complete()
            self.current_state.get_backend(
                backend_tag).backend_config = backend_config
+            backend_info = self.current_state.get_backend(backend_tag)

            # Scale the replicas with the new configuration.
            self.actor_reconciler._scale_backend_replicas(
                self.current_state.backends, backend_tag,
                backend_config.num_replicas)

+            return_uuid = self._create_event_with_result({
+                backend_tag: backend_info
+            })
            # NOTE(edoakes): we must write a checkpoint before pushing the
            # update to avoid inconsistent state if we crash after pushing the
            # update.
            self._checkpoint()

-            # Inform the router about change in configuration
+            # Inform the routers about change in configuration
            # (particularly for setting max_batch_size).

            await self.actor_reconciler._start_pending_backend_replicas(
@@ -922,6 +979,7 @@ class ServeController:

            self.notify_replica_handles_changed()
            self.notify_backend_configs_changed()
+            return return_uuid

    def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
        """Get the current config for the specified backend."""
@@ -929,11 +987,15 @@ class ServeController:
                ), "Backend {} is not registered.".format(backend_tag)
        return self.current_state.get_backend(backend_tag).backend_config

+    def get_http_config(self):
+        """Return the HTTP proxy configuration."""
+        return self.http_host, self.http_port
+
    async def shutdown(self) -> None:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
-            for router in self.actor_reconciler.router_handles():
-                ray.kill(router, no_restart=True)
+            for http_proxy in self.actor_reconciler.http_proxy_handles():
+                ray.kill(http_proxy, no_restart=True)
            for replica in self.actor_reconciler.get_replica_handles():
                ray.kill(replica, no_restart=True)
            self.kv_store.delete(CHECKPOINT_KEY)
@@ -89,5 +89,6 @@ class RandomEndpointPolicy(EndpointPolicy):
                query.metadata.shard_key.encode("utf-8"))

        chosen_backend, shadow_backends = self._select_backends(value)
-        logger.debug(f"Chosen backend {chosen_backend} for query {query}")
+        logger.debug(f"Assigning query {query.metadata.request_id} "
+                     f"to backend {chosen_backend}.")
        return [chosen_backend] + shadow_backends
@@ -7,6 +7,7 @@ import ray
 from ray.serve.context import TaskContext
 from ray.serve.router import RequestMetadata, Router
 from ray.serve.utils import get_random_letters
+from ray.serve.exceptions import RayServeException

 global_async_loop = None

@@ -109,16 +110,25 @@ class RayServeHandle:
            ``**kwargs``: All keyword arguments will be available in
                ``request.args``.
        """
-        assert self.sync, "handle.remote() should be called from sync handle."
+        if not self.sync:
+            raise RayServeException(
+                "You are trying to call handle.remote() with async handle. "
+                "Please use `await handle.remote_async()` instead.")
+
        coro = self._remote(request_data, kwargs)
        future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe(
            coro, self.async_loop)
+
        # Block until the result is ready.
        return future.result()

-    async def _remote_async(self, request_data, **kwargs) -> ray.ObjectRef:
+    async def remote_async(self,
+                           request_data: Optional[Union[Dict, Any]] = None,
+                           **kwargs) -> ray.ObjectRef:
        """Experimental API for enqueue a request in async context."""
-        assert not self.sync, "_remote_async must be called inside async loop."
+        if not asyncio.get_event_loop().is_running():
+            raise RayServeException(
+                "remote_async must be called from a running event loop.")
        return await self._remote(request_data, kwargs)

    def options(self,
@@ -3,46 +3,46 @@ import socket
 from typing import List

 import uvicorn
+import starlette.responses

 import ray
 from ray.exceptions import RayTaskError
+from ray.serve.constants import LongPollKey
 from ray.serve.context import TaskContext
 from ray.util import metrics
 from ray.serve.utils import _get_logger, get_random_letters
 from ray.serve.http_util import Response
+from ray.serve.long_poll import LongPollAsyncClient
 from ray.serve.router import Router, RequestMetadata

-# The maximum number of times to retry a request due to actor failure.
-# TODO(edoakes): this should probably be configurable.
-MAX_ACTOR_DEAD_RETRIES = 10
-
 logger = _get_logger()


 class HTTPProxy:
-    """
-    This class should be instantiated and ran by ASGI server.
+    """This class is meant to be instantiated and run by an ASGI HTTP server.

    >>> import uvicorn
-    >>> uvicorn.run(HTTPProxy(kv_store_actor_handle, router_handle))
+    >>> uvicorn.run(HTTPProxy(controller_name))
-    # blocks forever
    """

-    async def fetch_config_from_controller(self, controller_name):
-        assert ray.is_initialized()
+    def __init__(self, controller_name):
        controller = ray.get_actor(controller_name)
-
-        self.route_table = await controller.get_router_config.remote()
+        self.route_table = {}  # Should be updated via long polling.
+        self.router = Router(controller)
+        self.long_poll_client = LongPollAsyncClient(controller, {
+            LongPollKey.ROUTE_TABLE: self._update_route_table,
+        })

        self.request_counter = metrics.Count(
            "num_http_requests",
            description="The number of HTTP requests processed",
            tag_keys=("route", ))

-        self.router = Router(controller)
+    async def setup(self):
        await self.router.setup_in_async_loop()

-    def set_route_table(self, route_table):
+    async def _update_route_table(self, route_table):
+        logger.debug(f"HTTP Proxy: Get updated route table: {route_table}.")
        self.route_table = route_table

    async def receive_http_body(self, scope, receive, send):
@@ -74,8 +74,11 @@ class HTTPProxy:
                status_code=404).send(scope, receive, send)

    async def __call__(self, scope, receive, send):
-        # NOTE: This implements ASGI protocol specified in
-        #       https://asgi.readthedocs.io/en/latest/specs/index.html
+        """Implements the ASGI protocol.
+
+        See details at:
+            https://asgi.readthedocs.io/en/latest/specs/index.html.
+        """

        error_sender = self._make_error_sender(scope, receive, send)

@@ -126,6 +129,18 @@ class HTTPProxy:
        if isinstance(result, RayTaskError):
            error_message = "Task Error. Traceback: {}.".format(result)
            await error_sender(error_message, 500)
+        elif isinstance(result, starlette.responses.Response):
+            if isinstance(result, starlette.responses.StreamingResponse):
+                raise TypeError("Starlette StreamingResponse returned by "
+                                f"backend for endpoint {endpoint_name}. "
+                                "StreamingResponse is unserializable and not "
+                                "supported by Ray Serve.  Consider using "
+                                "another Starlette response type such as "
+                                "Response, HTMLResponse, PlainTextResponse, "
+                                "or JSONResponse.  If support for "
+                                "StreamingResponse is desired, please let "
+                                "the Ray team know by making a Github issue!")
+            await result(scope, receive, send)
        else:
            await Response(result).send(scope, receive, send)

@@ -137,12 +152,13 @@ class HTTPProxyActor:
            host,
            port,
            controller_name,
-            http_middlewares: List["starlette.middleware.Middleware"] = []):
+            http_middlewares: List[
+                "starlette.middleware.Middleware"] = []):  # noqa: F821
        self.host = host
        self.port = port

-        self.app = HTTPProxy()
-        await self.app.fetch_config_from_controller(controller_name)
+        self.app = HTTPProxy(controller_name)
+        await self.app.setup()

        self.wrapped_app = self.app
        for middleware in http_middlewares:
@@ -180,12 +196,3 @@ class HTTPProxyActor:
        # the main thread and uvicorn doesn't expose a way to configure it.
        server.install_signal_handlers = lambda: None
        await server.serve(sockets=[sock])
-
-    async def set_route_table(self, route_table):
-        self.app.set_route_table(route_table)
-
-    # ------ Proxy router logic ------ #
-    async def assign_request(self, request_meta, *request_args,
-                             **request_kwargs):
-        return await (await self.app.router.assign_request(
-            request_meta, *request_args, **request_kwargs))
@@ -117,7 +117,7 @@ class Response:
        elif content_type == "json":
            self.raw_headers.append([b"content-type", b"application/json"])
        else:
-            raise ValueError("Invalid content type {}".foramt(content_type))
+            raise ValueError("Invalid content type {}".format(content_type))

    async def send(self, scope, receive, send):
        await send({
@@ -1,4 +1,5 @@
 import asyncio
+from inspect import iscoroutinefunction
 import random
 from collections import defaultdict
 from dataclasses import dataclass
@@ -22,7 +23,7 @@ class UpdatedObject:
 UpdateStateAsyncCallable = Callable[[Any], Awaitable[None]]


-class LongPollerAsyncClient:
+class LongPollAsyncClient:
    """The asynchronous long polling client.

    Internally, it runs `await object_ref` in a `while True` loop. When a
@@ -31,7 +32,7 @@ class LongPollerAsyncClient:
    the next poll.

    Args:
-        host_actor(ray.ActorHandle): handle to actor embedding LongPollerHost.
+        host_actor(ray.ActorHandle): handle to actor embedding LongPollHost.
        key_listeners(Dict[str, AsyncCallable]): a dictionary mapping keys to
          callbacks to be called on state update for the corresponding keys.
    """
@@ -40,6 +41,10 @@ class LongPollerAsyncClient:
                 key_listeners: Dict[str, UpdateStateAsyncCallable]) -> None:
        self.host_actor = host_actor
        self.key_listeners = key_listeners
+        for callback in key_listeners.values():
+            if not iscoroutinefunction(callback):
+                raise ValueError(
+                    "Callbacks to async long poller must be 'async def'.")

        self.snapshot_ids: Dict[str, int] = {
            key: -1
@@ -56,34 +61,31 @@ class LongPollerAsyncClient:
            self.snapshot_ids)
        return object_ref

-    def _update(self, updates: Dict[str, UpdatedObject]):
-        for key, update in updates.items():
-            self.object_snapshots[key] = update.object_snapshot
-            self.snapshot_ids[key] = update.snapshot_id
-
    async def _do_long_poll(self):
        while True:
            try:
                updates: Dict[str, UpdatedObject] = await self._poll_once()
-                self._update(updates)
-                logger.debug(f"LongPollerClient received udpates: {updates}")
-                for key, updated_object in updates.items():
+                logger.debug("LongPollClient received updates for keys: "
+                             f"{list(updates.keys())}.")
+                for key, update in updates.items():
+                    self.object_snapshots[key] = update.object_snapshot
+                    self.snapshot_ids[key] = update.snapshot_id
                    # NOTE(simon):
                    # This blocks the loop from doing another poll. Consider
                    # use loop.create_task here or poll first then call the
                    # callbacks.
                    callback = self.key_listeners[key]
-                    await callback(updated_object.object_snapshot)
+                    await callback(update.object_snapshot)
            except ray.exceptions.RayActorError:
                # This can happen during shutdown where the controller is
                # intentionally killed, the client should just gracefully
                # exit.
-                logger.debug("LongPollerClient failed to connect to host. "
+                logger.debug("LongPollClient failed to connect to host. "
                             "Shutting down.")
                break


-class LongPollerHost:
+class LongPollHost:
-    """The server side object that manages long pulling requests.
+    """The server side object that manages long polling requests.

    The desired use case is to embed this in an Ray actor. Client will be
@@ -115,11 +117,10 @@ class LongPollerHost:
        immediately if the snapshot_ids are outdated, otherwise it will block
        until there's one updates.
        """
-        # 1. Figure out which keys do we care about
-        watched_keys = set(self.snapshot_ids.keys()).intersection(
-            keys_to_snapshot_ids.keys())
-        if len(watched_keys) == 0:
-            raise ValueError("Keys not found.")
+        watched_keys = keys_to_snapshot_ids.keys()
+        nonexistent_keys = set(watched_keys) - set(self.snapshot_ids.keys())
+        if len(nonexistent_keys) > 0:
+            raise ValueError(f"Keys not found: {nonexistent_keys}.")

        # 2. If there are any outdated keys (by comparing snapshot ids)
        #    return immediately.
@@ -159,7 +160,7 @@ class LongPollerHost:
    def notify_changed(self, object_key: str, updated_object: Any):
        self.snapshot_ids[object_key] += 1
        self.object_snapshots[object_key] = updated_object
-        logger.debug(f"LongPollerHost: {object_key} = {updated_object}")
+        logger.debug(f"LongPollHost: Notify change for key {object_key}.")

        if object_key in self.notifier_events:
            for event in self.notifier_events.pop(object_key):
@@ -6,9 +6,10 @@ from typing import Any, DefaultDict, Dict, Iterable, List, Optional

 import ray
 from ray.actor import ActorHandle
+from ray.serve.constants import LongPollKey
 from ray.serve.context import TaskContext
 from ray.serve.endpoint_policy import EndpointPolicy, RandomEndpointPolicy
-from ray.serve.long_poll import LongPollerAsyncClient
+from ray.serve.long_poll import LongPollAsyncClient
 from ray.serve.utils import logger
 from ray.util import metrics

@@ -106,7 +107,8 @@ class ReplicaSet:
                   ) >= self.max_concurrent_queries:
                # This replica is overloaded, try next one
                continue
-            logger.debug(f"Replica set assigned {query} to {replica}")
+            logger.debug(f"Assigned query {query.metadata.request_id} "
+                         f"to replica {replica}.")
            ref = replica.handle_request.remote(query)
            self.in_flight_queries[replica].add(ref)
            return ref
@@ -133,7 +135,8 @@ class ReplicaSet:
        """
        assigned_ref = self._try_assign_replica(query)
        while assigned_ref is None:  # Can't assign a replica right now.
-            logger.debug(f"Failed to assign a replica for query {query}")
+            logger.debug("Failed to assign a replica for "
+                         f"query {query.metadata.request_id}")
            # Maybe there exists a free replica, we just need to refresh our
            # query tracker.
            num_finished = self._drain_completed_object_refs()
@@ -141,7 +144,7 @@ class ReplicaSet:
            # config to be updated.
            if num_finished == 0:
                logger.debug(
-                    f"All replicas are busy, waiting for a free replica.")
+                    "All replicas are busy, waiting for a free replica.")
                await asyncio.wait(
                    self._all_query_refs + [self.config_updated_event.wait()],
                    return_when=asyncio.FIRST_COMPLETED)
@@ -176,14 +179,14 @@ class Router:

    async def setup_in_async_loop(self):
        # NOTE(simon): Instead of performing initialization in __init__,
-        # We separated the init of LongPollerAsyncClient to this method because
-        # __init__ might be called in sync context. LongPollerAsyncClient
+        # We separated the init of LongPollAsyncClient to this method because
+        # __init__ might be called in sync context. LongPollAsyncClient
        # requires async context.
-        self.long_pull_client = LongPollerAsyncClient(
+        self.long_poll_client = LongPollAsyncClient(
            self.controller, {
-                "traffic_policies": self._update_traffic_policies,
-                "worker_handles": self._update_worker_handles,
-                "backend_configs": self._update_backend_configs,
+                LongPollKey.TRAFFIC_POLICIES: self._update_traffic_policies,
+                LongPollKey.REPLICA_HANDLES: self._update_replica_handles,
+                LongPollKey.BACKEND_CONFIGS: self._update_backend_configs,
            })

    async def _update_traffic_policies(self, traffic_policies):
@@ -194,8 +197,9 @@ class Router:
                 event = self._pending_endpoints.pop(endpoint)
                 event.set()
 
-    async def _update_worker_handles(self, worker_handles):
-        for backend_tag, replica_handles in worker_handles.items():
+    async def _update_replica_handles(self, all_replica_handles):
+        """Update the replica handles tracked for every backend."""
+        for backend_tag, replica_handles in all_replica_handles.items():
             self.backend_replicas[backend_tag].update_worker_replicas(
                 replica_handles)

@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import click
+
+import ray
+from ray import serve
+from ray.serve.constants import DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT
+
+
+@click.group(
+    help="[EXPERIMENTAL] CLI for managing Serve instances on a Ray cluster.")
+@click.option(
+    "--address",
+    "-a",
+    default="auto",
+    required=False,
+    type=str,
+    help="Address of the running Ray cluster to connect to. "
+    "Defaults to \"auto\".")
+def cli(address):
+    ray.init(address=address)
+
+
+@cli.command(help="Start a detached Serve instance on the Ray cluster.")
+@click.option(
+    "--http-host",
+    default=DEFAULT_HTTP_HOST,
+    required=False,
+    type=str,
+    help="Host for HTTP servers to listen on. "
+    f"Defaults to {DEFAULT_HTTP_HOST}.")
+@click.option(
+    "--http-port",
+    default=DEFAULT_HTTP_PORT,
+    required=False,
+    type=int,
+    help="Port for HTTP servers to listen on. "
+    f"Defaults to {DEFAULT_HTTP_PORT}.")
+def start(http_host, http_port):
+    serve.start(detached=True, http_host=http_host, http_port=http_port)
+
+
+@cli.command(help="Shutdown the running Serve instance on the Ray cluster.")
+def shutdown():
+    serve.connect().shutdown()
@@ -7,6 +7,7 @@ import pytest
 import ray
 from ray import serve
 from ray.serve.config import BackendConfig
+from ray.serve.constants import LongPollKey

 if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False) == 1:
    serve.controller._CRASH_AFTER_CHECKPOINT_PROBABILITY = 0.5
@@ -42,22 +43,22 @@ def mock_controller_with_name():
    @ray.remote(num_cpus=0)
    class MockControllerActor:
        def __init__(self):
-            from ray.serve.long_poll import LongPollerHost
-            self.host = LongPollerHost()
+            from ray.serve.long_poll import LongPollHost
+            self.host = LongPollHost()
            self.backend_replicas = defaultdict(list)
            self.backend_configs = dict()
            self.clear()

        def clear(self):
-            self.host.notify_changed("worker_handles", {})
-            self.host.notify_changed("traffic_policies", {})
-            self.host.notify_changed("backend_configs", {})
+            self.host.notify_changed(LongPollKey.REPLICA_HANDLES, {})
+            self.host.notify_changed(LongPollKey.TRAFFIC_POLICIES, {})
+            self.host.notify_changed(LongPollKey.BACKEND_CONFIGS, {})

        async def listen_for_change(self, snapshot_ids):
            return await self.host.listen_for_change(snapshot_ids)

        def set_traffic(self, endpoint, traffic_policy):
-            self.host.notify_changed("traffic_policies",
+            self.host.notify_changed(LongPollKey.TRAFFIC_POLICIES,
                                     {endpoint: traffic_policy})

        def add_new_replica(self,
@@ -68,15 +69,17 @@ def mock_controller_with_name():
            self.backend_configs[backend_tag] = backend_config

            self.host.notify_changed(
-                "worker_handles",
+                LongPollKey.REPLICA_HANDLES,
                self.backend_replicas,
            )
-            self.host.notify_changed("backend_configs", self.backend_configs)
+            self.host.notify_changed(LongPollKey.BACKEND_CONFIGS,
+                                     self.backend_configs)

        def update_backend(self, backend_tag: str,
                           backend_config: BackendConfig):
            self.backend_configs[backend_tag] = backend_config
-            self.host.notify_changed("backend_configs", self.backend_configs)
+            self.host.notify_changed(LongPollKey.BACKEND_CONFIGS,
+                                     self.backend_configs)

    name = f"MockController{random.randint(0,10e4)}"
    yield name, MockControllerActor.options(name=name).remote()
@@ -4,6 +4,7 @@ import time
 import os
 import pytest
 import requests
+import starlette.responses

 import ray
 from ray import serve
@@ -25,22 +26,6 @@ def test_e2e(serve_instance):
    client.create_endpoint(
        "endpoint", backend="echo:v1", route="/api", methods=["GET", "POST"])

-    retry_count = 5
-    timeout_sleep = 0.5
-    while True:
-        try:
-            resp = requests.get(
-                "http://127.0.0.1:8000/-/routes", timeout=0.5).json()
-            assert resp == {"/api": ["endpoint", ["GET", "POST"]]}
-            break
-        except Exception as e:
-            time.sleep(timeout_sleep)
-            timeout_sleep *= 2
-            retry_count -= 1
-            if retry_count == 0:
-                assert False, ("Route table hasn't been updated after 3 tries."
-                               "The latest error was {}").format(e)
-
    resp = requests.get("http://127.0.0.1:8000/api").json()["method"]
    assert resp == "GET"

@@ -48,6 +33,63 @@ def test_e2e(serve_instance):
    assert resp == "POST"


+def test_starlette_response(serve_instance):
+    client = serve_instance
+
+    def basic_response(_):
+        return starlette.responses.Response(
+            "Hello, world!", media_type="text/plain")
+
+    client.create_backend("basic_response", basic_response)
+    client.create_endpoint(
+        "basic_response", backend="basic_response", route="/basic_response")
+    assert requests.get(
+        "http://127.0.0.1:8000/basic_response").text == "Hello, world!"
+
+    def html_response(_):
+        return starlette.responses.HTMLResponse(
+            "<html><body><h1>Hello, world!</h1></body></html>")
+
+    client.create_backend("html_response", html_response)
+    client.create_endpoint(
+        "html_response", backend="html_response", route="/html_response")
+    assert requests.get(
+        "http://127.0.0.1:8000/html_response"
+    ).text == "<html><body><h1>Hello, world!</h1></body></html>"
+
+    def plain_text_response(_):
+        return starlette.responses.PlainTextResponse("Hello, world!")
+
+    client.create_backend("plain_text_response", plain_text_response)
+    client.create_endpoint(
+        "plain_text_response",
+        backend="plain_text_response",
+        route="/plain_text_response")
+    assert requests.get(
+        "http://127.0.0.1:8000/plain_text_response").text == "Hello, world!"
+
+    def json_response(_):
+        return starlette.responses.JSONResponse({"hello": "world"})
+
+    client.create_backend("json_response", json_response)
+    client.create_endpoint(
+        "json_response", backend="json_response", route="/json_response")
+    assert requests.get("http://127.0.0.1:8000/json_response").json()[
+        "hello"] == "world"
+
+    def redirect_response(_):
+        return starlette.responses.RedirectResponse(
+            url="http://127.0.0.1:8000/basic_response")
+
+    client.create_backend("redirect_response", redirect_response)
+    client.create_endpoint(
+        "redirect_response",
+        backend="redirect_response",
+        route="/redirect_response")
+    assert requests.get(
+        "http://127.0.0.1:8000/redirect_response").text == "Hello, world!"
+
+
 def test_backend_user_config(serve_instance):
    client = serve_instance

@@ -63,25 +105,26 @@ def test_backend_user_config(serve_instance):

    config = BackendConfig(num_replicas=2, user_config={"count": 123, "b": 2})
    client.create_backend("counter", Counter, config=config)
-    client.create_endpoint("counter", backend="counter", route="/counter")
+    client.create_endpoint("counter", backend="counter")
    handle = client.get_handle("counter")

    def check(val, num_replicas):
        pids_seen = set()
        for i in range(100):
            result = ray.get(handle.remote())
-            assert (str(result[0]) == val), result[0]
+            if str(result[0]) != val:
+                return False
            pids_seen.add(result[1])
-        assert (len(pids_seen) == num_replicas)
+        return len(pids_seen) == num_replicas

-    check("123", 2)
+    wait_for_condition(lambda: check("123", 2))

    client.update_backend_config("counter", BackendConfig(num_replicas=3))
-    check("123", 3)
+    wait_for_condition(lambda: check("123", 3))

    config = BackendConfig(user_config={"count": 456})
    client.update_backend_config("counter", config)
-    check("456", 3)
+    wait_for_condition(lambda: check("456", 3))


 def test_call_method(serve_instance):
@@ -183,7 +226,7 @@ def test_reject_duplicate_endpoint_and_route(serve_instance):
 def test_no_http(serve_instance):
    client = serve.start(http_host=None)

-    assert len(ray.get(client._controller.get_routers.remote())) == 0
+    assert len(ray.get(client._controller.get_http_proxies.remote())) == 0

    def hello(*args):
        return "hello"
@@ -223,11 +266,6 @@ def test_scaling_replicas(serve_instance):

    client.create_endpoint("counter", backend="counter:v1", route="/increment")

-    # Keep checking the routing table until /increment is populated
-    while "/increment" not in requests.get(
-            "http://127.0.0.1:8000/-/routes").json():
-        time.sleep(0.2)
-
    counter_result = []
    for _ in range(10):
        resp = requests.get("http://127.0.0.1:8000/increment").json()
@@ -267,11 +305,6 @@ def test_batching(serve_instance):
    client.create_endpoint(
        "counter1", backend="counter:v11", route="/increment2")

-    # Keep checking the routing table until /increment is populated
-    while "/increment2" not in requests.get(
-            "http://127.0.0.1:8000/-/routes").json():
-        time.sleep(0.2)
-
    future_list = []
    handle = client.get_handle("counter1")
    for _ in range(20):
@@ -299,8 +332,7 @@ def test_batching_exception(serve_instance):
    # Set the max batch size.
    config = BackendConfig(max_batch_size=5)
    client.create_backend("exception:v1", NoListReturned, config=config)
-    client.create_endpoint(
-        "exception-test", backend="exception:v1", route="/noListReturned")
+    client.create_endpoint("exception-test", backend="exception:v1")

    handle = client.get_handle("exception-test")
    with pytest.raises(ray.exceptions.RayTaskError):
@@ -323,16 +355,16 @@ def test_updating_config(serve_instance):
    client.create_endpoint("bsimple", backend="bsimple:v1", route="/bsimple")

    controller = client._controller
-    old_replica_tag_list = ray.get(
-        controller._list_replicas.remote("bsimple:v1"))
+    old_replica_tag_list = list(
+        ray.get(controller._all_replica_handles.remote())["bsimple:v1"].keys())

    update_config = BackendConfig(max_batch_size=5)
    client.update_backend_config("bsimple:v1", update_config)
-    new_replica_tag_list = ray.get(
-        controller._list_replicas.remote("bsimple:v1"))
+    new_replica_tag_list = list(
+        ray.get(controller._all_replica_handles.remote())["bsimple:v1"].keys())
    new_all_tag_list = []
    for worker_dict in ray.get(
-            controller.get_all_replica_handles.remote()).values():
+            controller._all_replica_handles.remote()).values():
        new_all_tag_list.extend(list(worker_dict.keys()))

    # the old and new replica tag list should be identical
@@ -648,7 +680,7 @@ def test_create_infeasible_error(serve_instance):
                "MagicMLResource": 100
            }})

-    # Even each replica might be feasible, the total might not be.
+    # Even though each replica might be feasible, the total might not be.
    current_cpus = int(ray.nodes()[0]["Resources"]["CPU"])
    num_replicas = current_cpus + 20
    config = BackendConfig(num_replicas=num_replicas)
@@ -661,10 +693,6 @@ def test_create_infeasible_error(serve_instance):
            }},
            config=config)

-    # No replica should be created!
-    replicas = ray.get(client._controller._list_replicas.remote("f1"))
-    assert len(replicas) == 0
-

 def test_shutdown():
    def f():
@@ -797,6 +825,7 @@ def test_serve_metrics(serve_instance):

    client.create_backend("metrics", batcher)
    client.create_endpoint("metrics", backend="metrics", route="/metrics")
+
    # send 10 concurrent requests
    url = "http://127.0.0.1:8000/metrics"
    ray.get([block_until_http_ready.remote(url) for _ in range(10)])
@@ -48,7 +48,7 @@ def setup_worker(name,
 async def add_servable_to_router(servable, router, controller_name, **kwargs):
    worker = setup_worker(
        "backend", servable, controller_name=controller_name, **kwargs)
-    await router._update_worker_handles.remote({"backend": [worker]})
+    await router._update_replica_handles.remote({"backend": [worker]})
    await router._update_traffic_policies.remote({
        "endpoint": TrafficPolicy({
            "backend": 1.0
@@ -0,0 +1,23 @@
+import pytest
+
+import ray
+
+
+def test_controller_inflight_requests_clear(serve_instance):
+    """Creating a backend and endpoint leaves no new inflight results."""
+    controller = serve_instance._controller
+    baseline = ray.get(controller._num_inflight_results.remote())
+
+    def function(_):
+        return "hello"
+
+    serve_instance.create_backend("tst", function)
+    serve_instance.create_endpoint("end_pt", backend="tst")
+
+    current = ray.get(controller._num_inflight_results.remote())
+    assert current - baseline == 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(pytest.main(["-v", "-s", __file__]))
@@ -4,6 +4,7 @@ import tempfile
 import time

 import ray
+from ray.test_utils import wait_for_condition
 from ray import serve
 from ray.serve.config import BackendConfig, ReplicaConfig

@@ -53,9 +54,11 @@ def test_controller_failure(serve_instance):
    client.create_backend("controller_failure:v2", function)
    client.set_traffic("controller_failure", {"controller_failure:v2": 1.0})

-    for _ in range(10):
+    def check_controller_failure():
        response = request_with_retries("/controller_failure", timeout=30)
-        assert response.text == "hello2"
+        return response.text == "hello2"
+
+    wait_for_condition(check_controller_failure)

    def function(_):
        return "hello3"
@@ -76,10 +79,10 @@ def test_controller_failure(serve_instance):
        assert response.text == "hello3"


-def _kill_routers(client):
-    routers = ray.get(client._controller.get_routers.remote())
-    for router in routers.values():
-        ray.kill(router, no_restart=False)
+def _kill_http_proxies(client):
+    http_proxies = ray.get(client._controller.get_http_proxies.remote())
+    for http_proxy in http_proxies.values():
+        ray.kill(http_proxy, no_restart=False)


 def test_http_proxy_failure(serve_instance):
@@ -98,7 +101,7 @@ def test_http_proxy_failure(serve_instance):
        response = request_with_retries("/proxy_failure", timeout=30)
        assert response.text == "hello1"

-    _kill_routers(client)
+    _kill_http_proxies(client)

    def function(_):
        return "hello2"
@@ -113,7 +116,7 @@ def test_http_proxy_failure(serve_instance):

 def _get_worker_handles(client, backend):
    controller = client._controller
-    backend_dict = ray.get(controller.get_all_replica_handles.remote())
+    backend_dict = ray.get(controller._all_replica_handles.remote())

    return list(backend_dict[backend].values())

@@ -124,7 +127,7 @@ def test_worker_restart(serve_instance):
    client = serve_instance

    class Worker1:
-        def __call__(self):
+        def __call__(self, *args):
            return os.getpid()

    client.create_backend("worker_failure:v1", Worker1)
@@ -176,7 +179,7 @@ def test_worker_replica_failure(serve_instance):
                while True:
                    pass

-        def __call__(self):
+        def __call__(self, *args):
            pass

    temp_path = os.path.join(tempfile.gettempdir(),
@@ -1,5 +1,4 @@
 import sys
-import functools
 import time
 import asyncio
 import os
@@ -8,12 +7,12 @@ from typing import Dict
 import pytest

 import ray
-from ray.serve.long_poll import (LongPollerAsyncClient, LongPollerHost,
+from ray.serve.long_poll import (LongPollAsyncClient, LongPollHost,
                                 UpdatedObject)


 def test_host_standalone(serve_instance):
-    host = ray.remote(LongPollerHost).remote()
+    host = ray.remote(LongPollHost).remote()

    # Write two values
    ray.get(host.notify_changed.remote("key_1", 999))
@@ -44,10 +43,10 @@ def test_long_poll_restarts(serve_instance):
        max_restarts=-1,
        max_task_retries=-1,
    )
-    class RestartableLongPollerHost:
+    class RestartableLongPollHost:
        def __init__(self) -> None:
            print("actor started")
-            self.host = LongPollerHost()
+            self.host = LongPollHost()
            self.host.notify_changed("timer", time.time())
            self.should_exit = False

@@ -63,7 +62,7 @@ def test_long_poll_restarts(serve_instance):
                print("actor exit")
                os._exit(1)

-    host = RestartableLongPollerHost.remote()
+    host = RestartableLongPollHost.remote()
    updated_values = ray.get(host.listen_for_change.remote({"timer": -1}))
    timer: UpdatedObject = updated_values["timer"]

@@ -81,22 +80,31 @@ def test_long_poll_restarts(serve_instance):

@pytest.mark.asyncio
 async def test_async_client(serve_instance):
-    host = ray.remote(LongPollerHost).remote()
+    host = ray.remote(LongPollHost).remote()

    # Write two values
    ray.get(host.notify_changed.remote("key_1", 100))
    ray.get(host.notify_changed.remote("key_2", 999))

+    # Check that construction fails with a sync callback.
+    def callback(result, key):
+        pass
+
+    with pytest.raises(ValueError):
+        client = LongPollAsyncClient(host, {"key": callback})
+
    callback_results = dict()

-    async def callback(result, key):
-        callback_results[key] = result
+    async def key_1_callback(result):
+        callback_results["key_1"] = result

-    client = LongPollerAsyncClient(
-        host, {
-            "key_1": functools.partial(callback, key="key_1"),
-            "key_2": functools.partial(callback, key="key_2")
-        })
+    async def key_2_callback(result):
+        callback_results["key_2"] = result
+
+    client = LongPollAsyncClient(host, {
+        "key_1": key_1_callback,
+        "key_2": key_2_callback,
+    })

    while len(client.object_snapshots) == 0:
        # Yield the loop for client to get the result
@@ -144,6 +144,7 @@ class ServeEncoder(json.JSONEncoder):
@ray.remote(num_cpus=0)
 def block_until_http_ready(http_endpoint,
                           backoff_time_s=1,
+                           check_ready=None,
                           timeout=HTTP_PROXY_TIMEOUT):
    http_is_ready = False
    start_time = time.time()
@@ -152,7 +153,10 @@ def block_until_http_ready(http_endpoint,
        try:
            resp = requests.get(http_endpoint)
            assert resp.status_code == 200
-            http_is_ready = True
+            if check_ready is None:
+                http_is_ready = True
+            else:
+                http_is_ready = check_ready(resp)
        except Exception:
            pass

@@ -95,6 +95,8 @@ py_test_module_list(
    "test_dask_callback.py",
    "test_debug_tools.py",
    "test_experimental_client.py",
+    "test_experimental_client_metadata.py",
+    "test_experimental_client_terminate.py",
    "test_job.py",
    "test_memstat.py",
    "test_metrics_agent.py",
@@ -1 +1 @@
-ray[debug]
+ray
@@ -1 +1 @@
-ray[debug]
+ray
@@ -1 +1 @@
-ray[debug]
+ray
@@ -8,6 +8,7 @@ try:
 except ImportError:
    pytest_timeout = None
 import sys
+import tempfile
 import datetime

 import ray
@@ -867,5 +868,61 @@ def test_actor_creation_latency(ray_start_regular_shared):
        actor_create_time - start, end - start))


+@pytest.mark.parametrize(
+    "exit_condition",
+    [
+        # "out_of_scope", TODO(edoakes): enable this once fixed.
+        "__ray_terminate__",
+        "ray.actor.exit_actor",
+        "ray.kill"
+    ])
+def test_atexit_handler(ray_start_regular_shared, exit_condition):
+    """Check which ways of ending an actor run its atexit handlers."""
+    @ray.remote
+    class A():
+        def __init__(self, tmpfile, data):
+            import atexit
+
+            def write_marker(*args, **kwargs):
+                # Leave evidence on disk that the handler actually ran.
+                with open(tmpfile, "w") as out:
+                    out.write(data)
+                    out.flush()
+
+            atexit.register(write_marker)
+
+        def ready(self):
+            pass
+
+        def exit(self):
+            ray.actor.exit_actor()
+
+    data = "hello"
+    tmpfile = tempfile.NamedTemporaryFile()
+    a = A.remote(tmpfile.name, data)
+    ray.get(a.ready.remote())
+
+    if exit_condition == "out_of_scope":
+        del a
+    elif exit_condition == "__ray_terminate__":
+        ray.wait([a.__ray_terminate__.remote()])
+    elif exit_condition == "ray.actor.exit_actor":
+        ray.wait([a.exit.remote()])
+    elif exit_condition == "ray.kill":
+        ray.kill(a)
+    else:
+        assert False, "Unrecognized condition"
+
+    def check_file_written():
+        with open(tmpfile.name) as marker:
+            return marker.read() == data
+
+    # ray.kill() should not trigger atexit handlers, all other methods should.
+    if exit_condition == "ray.kill":
+        assert not check_file_written()
+    else:
+        ray.test_utils.wait_for_condition(check_file_written)
+
+
 if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
@@ -1055,11 +1055,11 @@ def test_actor_resource_demand(shutdown_only):
    ray.get(a.foo.remote())
    time.sleep(1)

-    message = global_state_accessor.get_all_heartbeat()
-    heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
+    message = global_state_accessor.get_all_resource_usage()
+    resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)

    # The actor is scheduled so there should be no more demands left.
-    assert len(heartbeat.resource_load_by_shape.resource_demands) == 0
+    assert len(resource_usages.resource_load_by_shape.resource_demands) == 0

    @ray.remote(num_cpus=80)
    class Actor2:
@@ -1070,23 +1070,24 @@ def test_actor_resource_demand(shutdown_only):
    time.sleep(1)

    # This actor cannot be scheduled.
-    message = global_state_accessor.get_all_heartbeat()
-    heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
-    assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
-    assert (heartbeat.resource_load_by_shape.resource_demands[0].shape == {
-        "CPU": 80.0
-    })
-    assert (heartbeat.resource_load_by_shape.resource_demands[0]
+    message = global_state_accessor.get_all_resource_usage()
+    resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
+    assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
+    assert (
+        resource_usages.resource_load_by_shape.resource_demands[0].shape == {
+            "CPU": 80.0
+        })
+    assert (resource_usages.resource_load_by_shape.resource_demands[0]
            .num_infeasible_requests_queued == 1)

    actors.append(Actor2.remote())
    time.sleep(1)

    # Two actors cannot be scheduled.
-    message = global_state_accessor.get_all_heartbeat()
-    heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
-    assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
-    assert (heartbeat.resource_load_by_shape.resource_demands[0]
+    message = global_state_accessor.get_all_resource_usage()
+    resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
+    assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
+    assert (resource_usages.resource_load_by_shape.resource_demands[0]
            .num_infeasible_requests_queued == 2)

    global_state_accessor.disconnect()
@@ -1,3 +1,4 @@
+import asyncio
 import collections
 import numpy as np
 import os
@@ -211,6 +212,66 @@ def test_actor_restart_with_retry(ray_init_with_task_retry_delay):
        ray.get(actor.increase.remote())


+def test_named_actor_max_task_retries(ray_init_with_task_retry_delay):
+    """Tasks sent via a ray.get_actor() handle are retried after restart."""
+    @ray.remote(num_cpus=0)
+    class Counter:
+        def __init__(self):
+            self.count = 0
+            self.event = asyncio.Event()
+
+        def increment(self):
+            self.count += 1
+            self.event.set()
+
+        async def wait_for_count(self, count):
+            while True:
+                if self.count >= count:
+                    return
+                await self.event.wait()
+                self.event.clear()
+
+    @ray.remote
+    class ActorToKill:
+        def __init__(self, counter):
+            counter.increment.remote()
+
+        def run(self, counter, signal):
+            counter.increment.remote()
+            ray.get(signal.wait.remote())
+
+    @ray.remote
+    class CallingActor:
+        def __init__(self):
+            self.actor = ray.get_actor("a")
+
+        def call_other(self, counter, signal):
+            return ray.get(self.actor.run.remote(counter, signal))
+
+    init_counter = Counter.remote()
+    run_counter = Counter.remote()
+    signal = SignalActor.remote()
+
+    # Start the two actors, wait for ActorToKill's constructor to run.
+    a = ActorToKill.options(
+        name="a", max_restarts=-1, max_task_retries=-1).remote(init_counter)
+    c = CallingActor.remote()
+    ray.get(init_counter.wait_for_count.remote(1), timeout=30)
+
+    # Have the CallingActor call ActorToKill, wait until run() has started,
+    # then kill ActorToKill. The restart should run its constructor a second
+    # time, and the run method should begin a second time as well.
+    ref = c.call_other.remote(run_counter, signal)
+    ray.get(run_counter.wait_for_count.remote(1), timeout=30)
+    ray.kill(a, no_restart=False)
+    ray.get(init_counter.wait_for_count.remote(2), timeout=30)
+    ray.get(run_counter.wait_for_count.remote(2), timeout=30)
+
+    # Signal the run method to finish, verify that the CallingActor returns.
+    signal.send.remote()
+    ray.get(ref, timeout=30)
+
+
 def test_actor_restart_on_node_failure(ray_start_cluster):
    config = {
        "num_heartbeats_timeout": 10,
@@ -94,8 +94,13 @@ def test_local_scheduling_first(ray_start_cluster):
        assert local()


-@pytest.mark.skipif(new_scheduler_enabled(), reason="flakes more often")
-def test_load_balancing_with_dependencies(ray_start_cluster):
+@pytest.mark.parametrize("fast", [True, False])
+def test_load_balancing_with_dependencies(ray_start_cluster, fast):
+    if fast and new_scheduler_enabled():
+        # Load-balancing on new scheduler can be inefficient if (task
+        # duration:heartbeat interval) is small enough.
+        pytest.skip("short tasks balance poorly on the new scheduler")
+
    # This test ensures that tasks are being assigned to all raylets in a
    # roughly equal manner even when the tasks have dependencies.
    cluster = ray_start_cluster
@@ -106,7 +111,10 @@ def test_load_balancing_with_dependencies(ray_start_cluster):

    @ray.remote
    def f(x):
-        time.sleep(0.010)
+        if fast:
+            time.sleep(0.010)
+        else:
+            time.sleep(0.1)
        return ray.worker.global_worker.node.unique_id

    # This object will be local to one of the raylets. Make sure
@@ -198,6 +198,32 @@ async def test_asyncio_double_await(ray_start_regular_shared):
    await waiting


+@pytest.mark.asyncio
+async def test_asyncio_exit_actor(ray_start_regular_shared):
+    """Regression test for https://github.com/ray-project/ray/issues/12649."""
+    # Without the fix this test is expected to just hang.
+
+    @ray.remote
+    class Actor:
+        async def exit(self):
+            ray.actor.exit_actor()
+
+        async def ping(self):
+            return "pong"
+
+        async def loop_forever(self):
+            while True:
+                await asyncio.sleep(5)
+
+    a = Actor.options(max_task_retries=0).remote()
+    a.loop_forever.remote()
+    # Make sure exit_actor exits immediately, not once all tasks completed.
+    ray.get(a.exit.remote())
+
+    with pytest.raises(ray.exceptions.RayActorError):
+        ray.get(a.ping.remote())
+
+
 if __name__ == "__main__":
    import pytest
    sys.exit(pytest.main(["-v", __file__]))
@@ -537,6 +537,7 @@ class AutoscalingTest(unittest.TestCase):
        self.provider = MockProvider()
        self.provider.create_node({}, {TAG_RAY_NODE_KIND: "worker"}, 10)
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
@@ -558,6 +559,7 @@ class AutoscalingTest(unittest.TestCase):
        config_path = self.write_config(SMALL_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(11)])
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path,
@@ -613,6 +615,70 @@ class AutoscalingTest(unittest.TestCase):
        autoscaler.update()
        self.waitForNodes(0)

+    def testLegacyYamlWithRequestResources(self):
+        """Test that request_resources() adds workers with legacy yamls.
+
+        Makes sure that workers are added for requested resources when
+        necessary: if the requested resources fit on the head node we
+        don't add more nodes, but we do add nodes when they don't fit.
+        """
+        config = SMALL_CLUSTER.copy()
+        config["min_workers"] = 0
+        config["max_workers"] = 100
+        config["idle_timeout_minutes"] = 0
+        config["upscaling_speed"] = 1
+        config_path = self.write_config(config)
+
+        self.provider = MockProvider()
+        self.provider.create_node({}, {
+            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
+            TAG_RAY_USER_NODE_TYPE: NODE_TYPE_LEGACY_HEAD
+        }, 1)
+        head_ip = self.provider.non_terminated_node_ips(
+            tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_HEAD}, )[0]
+        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
+
+        lm = LoadMetrics()
+        lm.local_ip = head_ip
+        lm.update(head_ip, {"CPU": 1}, {"CPU": 1}, {})
+        autoscaler = StandardAutoscaler(
+            config_path,
+            lm,
+            max_launch_batch=5,
+            max_concurrent_launches=5,
+            max_failures=0,
+            process_runner=runner,
+            update_interval_s=0)
+        autoscaler.update()
+        # 1 head node.
+        self.waitForNodes(1)
+        autoscaler.request_resources([{"CPU": 1}])
+        autoscaler.update()
+        # Still 1 head node because request_resources fits on the head node.
+        self.waitForNodes(1)
+        autoscaler.request_resources([{"CPU": 1}] + [{"CPU": 2}] * 9)
+        autoscaler.update()
+        self.waitForNodes(2)  # Adds a single worker to get its resources.
+        autoscaler.update()
+        self.waitForNodes(2)  # Still 1 worker because its resources
+        # aren't known.
+        lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}, {})
+        autoscaler.update()
+        self.waitForNodes(10)  # 9 workers and 1 head node, scaled immediately.
+        lm.update(
+            "172.0.0.1", {"CPU": 2}, {"CPU": 2}, {},
+            waiting_bundles=[{
+                "CPU": 2
+            }] * 9,
+            infeasible_bundles=[{
+                "CPU": 1
+            }] * 1)
+        autoscaler.update()
+        # Make sure that no more nodes are added when all the resources
+        # fit on the existing nodes.
+        self.waitForNodes(10)
+
    def testAggressiveAutoscaling(self):
        config = SMALL_CLUSTER.copy()
        config["min_workers"] = 0
@@ -629,7 +695,7 @@ class AutoscalingTest(unittest.TestCase):
        head_ip = self.provider.non_terminated_node_ips(
            tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_HEAD}, )[0]
        runner = MockProcessRunner()
-
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(11)])
        lm = LoadMetrics()
        lm.local_ip = head_ip

@@ -782,6 +848,7 @@ class AutoscalingTest(unittest.TestCase):
        config_path = self.write_config(SMALL_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
@@ -817,6 +884,7 @@ class AutoscalingTest(unittest.TestCase):
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
@@ -896,6 +964,7 @@ class AutoscalingTest(unittest.TestCase):
        config_path = self.write_config(SMALL_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(10)])
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path,
@@ -949,6 +1018,7 @@ class AutoscalingTest(unittest.TestCase):
        config_path = self.write_config(SMALL_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
@@ -989,6 +1059,7 @@ class AutoscalingTest(unittest.TestCase):
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner(fail_cmds=["setup_cmd"])
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
@@ -1000,14 +1071,18 @@ class AutoscalingTest(unittest.TestCase):
        self.waitForNodes(2)
        self.provider.finish_starting_nodes()
        autoscaler.update()
-        self.waitForNodes(
-            2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
+        try:
+            self.waitForNodes(
+                2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED})
+        except AssertionError:
+            # The failed nodes might have been already terminated by autoscaler
+            assert len(self.provider.non_terminated_nodes({})) == 0

    def testConfiguresOutdatedNodes(self):
        config_path = self.write_config(SMALL_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
-        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(4)])
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
@@ -1038,6 +1113,7 @@ class AutoscalingTest(unittest.TestCase):
        self.provider = MockProvider()
        lm = LoadMetrics()
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(5)])
        autoscaler = StandardAutoscaler(
            config_path,
            lm,
@@ -1087,12 +1163,22 @@ class AutoscalingTest(unittest.TestCase):
        autoscaler.update()

        assert autoscaler.pending_launches.value == 0
-        assert len(self.provider.non_terminated_nodes({})) == 3
+        # This actually remained 4 instead of 3, because the other 2 nodes
+        # are not connected and hence we rely more on connected nodes for
+        # min_workers. When the "pending" nodes show up as connected,
+        # then we can terminate the ones connected before.
+        assert len(self.provider.non_terminated_nodes({})) == 4
        lm.last_used_time_by_ip["172.0.0.2"] = 0
        lm.last_used_time_by_ip["172.0.0.3"] = 0
        autoscaler.update()
        assert autoscaler.pending_launches.value == 0
-        assert len(self.provider.non_terminated_nodes({})) == 1
+        # 2 nodes and not 1 because 1 is needed for min_worker and the other 1
+        # is still not connected.
+        self.waitForNodes(2)
+        # when we connect it, we will see 1 node.
+        lm.last_used_time_by_ip["172.0.0.4"] = 0
+        autoscaler.update()
+        self.waitForNodes(1)

    def testTargetUtilizationFraction(self):
        config = SMALL_CLUSTER.copy()
@@ -1103,6 +1189,7 @@ class AutoscalingTest(unittest.TestCase):
        self.provider = MockProvider()
        lm = LoadMetrics()
        runner = MockProcessRunner()
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(12)])
        autoscaler = StandardAutoscaler(
            config_path,
            lm,
@@ -1161,7 +1248,7 @@ class AutoscalingTest(unittest.TestCase):
        config_path = self.write_config(SMALL_CLUSTER)
        self.provider = MockProvider()
        runner = MockProcessRunner()
-        runner.respond_to_call("json .Config.Env", ["[]" for i in range(2)])
+        runner.respond_to_call("json .Config.Env", ["[]" for i in range(3)])
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path,
@@ -1,4 +1,5 @@
 import pytest
+import sys

 import ray
 import ray.cluster_utils
@@ -6,7 +7,7 @@ import ray.test_utils


 def test_cross_language_raise_kwargs(shutdown_only):
-    ray.init(_load_code_from_local=True)
+    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))

    with pytest.raises(Exception, match="kwargs"):
        ray.java_function("a", "b").remote(x="arg1")
@@ -16,7 +17,7 @@ def test_cross_language_raise_kwargs(shutdown_only):


 def test_cross_language_raise_exception(shutdown_only):
-    ray.init(_load_code_from_local=True)
+    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))

    class PythonObject(object):
        pass
@@ -2,7 +2,7 @@ import pytest
 from contextlib import contextmanager

 import ray.experimental.client.server.server as ray_client_server
-from ray.experimental.client import ray
+from ray.experimental.client import ray, reset_api
 from ray.experimental.client.common import ClientObjectRef


@@ -10,9 +10,12 @@ from ray.experimental.client.common import ClientObjectRef
 def ray_start_client_server():
    server = ray_client_server.serve("localhost:50051", test_mode=True)
    ray.connect("localhost:50051")
-    yield ray
-    ray.disconnect()
-    server.stop(0)
+    try:
+        yield ray
+    finally:
+        ray.disconnect()
+        server.stop(0)
+        reset_api()


 def test_real_ray_fallback(ray_start_regular_shared):
@@ -34,9 +37,6 @@ def test_real_ray_fallback(ray_start_regular_shared):
        nodes = ray.get(get_nodes.remote())
        assert len(nodes) == 1, nodes

-        with pytest.raises(NotImplementedError):
-            print(ray.nodes())
-

 def test_nested_function(ray_start_regular_shared):
    with ray_start_client_server() as ray:
@@ -170,6 +170,70 @@ def test_basic_actor(ray_start_regular_shared):
        assert count == 2


+def test_pass_handles(ray_start_regular_shared):
+    """
+    Test that client-side handles to remote functions and actors can be
+    passed into other remote functions and actors (which run on the server
+    or raylet side) and invoked there, transparently to the caller.
+    """
+    with ray_start_client_server() as ray:
+
+        @ray.remote
+        class ExecActor:
+            def exec(self, f, x):
+                return ray.get(f.remote(x))
+
+            def exec_exec(self, actor, f, x):
+                return ray.get(actor.exec.remote(f, x))
+
+        @ray.remote
+        def fact(x):
+            out = 1
+            while x > 0:
+                out = out * x
+                x -= 1
+            return out
+
+        @ray.remote
+        def func_exec(f, x):
+            return ray.get(f.remote(x))
+
+        @ray.remote
+        def func_actor_exec(actor, f, x):
+            return ray.get(actor.exec.remote(f, x))
+
+        @ray.remote
+        def sneaky_func_exec(obj, x):
+            return ray.get(obj["f"].remote(x))
+
+        @ray.remote
+        def sneaky_actor_exec(obj, x):
+            return ray.get(obj["actor"].exec.remote(obj["f"], x))
+
+        def local_fact(x):
+            if x <= 0:
+                return 1
+            return x * local_fact(x - 1)
+
+        assert ray.get(fact.remote(7)) == local_fact(7)
+        assert ray.get(func_exec.remote(fact, 8)) == local_fact(8)
+        test_obj = {}
+        test_obj["f"] = fact
+        assert ray.get(sneaky_func_exec.remote(test_obj, 5)) == local_fact(5)
+        actor_handle = ExecActor.remote()
+        assert ray.get(actor_handle.exec.remote(fact, 7)) == local_fact(7)
+        assert ray.get(func_actor_exec.remote(actor_handle, fact,
+                                              10)) == local_fact(10)
+        second_actor = ExecActor.remote()
+        assert ray.get(actor_handle.exec_exec.remote(second_actor, fact,
+                                                     9)) == local_fact(9)
+        test_actor_obj = {}
+        test_actor_obj["actor"] = second_actor
+        test_actor_obj["f"] = fact
+        assert ray.get(sneaky_actor_exec.remote(test_actor_obj,
+                                                4)) == local_fact(4)
+
+
 if __name__ == "__main__":
    import sys
    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,25 @@
+from ray.tests.test_experimental_client import ray_start_client_server
+
+
+def test_get_ray_metadata(ray_start_regular_shared):
+    """Exercise the client's ClusterInfo pathway and its API surface.
+
+    Checks is_initialized(), nodes(), cluster_resources() and
+    available_resources() through a connected client.
+    """
+    with ray_start_client_server() as ray:
+        node_ip = ray_start_regular_shared["node_ip_address"]
+        node_resource_key = "node:" + node_ip
+
+        assert ray.is_initialized()
+
+        node_list = ray.nodes()
+        assert len(node_list) == 1, node_list
+        assert node_list[0]["NodeManagerAddress"] == node_ip
+
+        total = ray.cluster_resources()
+        free = ray.available_resources()
+
+        assert total["CPU"] == 1.0
+        assert node_resource_key in total
+        assert node_resource_key in free
--- a/Show More
+++ b/Show More