Files
ray/src/ray/raylet/task_dependency_manager.h
T
Stephanie Wang d95c8b8a41 [core][new scheduler] Move tasks from ready to dispatch to waiting on argument eviction (#13048)
* Add index for tasks to dispatch

* Task dependency manager interface

* Unsubscribe dependencies and tests

* NodeManager

* Revert "Add index for tasks to dispatch"

This reverts commit c6ccb9aa306e00f80d34b991055e4e83872595ea.

* tmp

* Move back to waiting if args not ready

* update
2020-12-23 09:33:43 -08:00

261 lines
12 KiB
C++

// Copyright 2017 The Ray Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// clang-format off
#include "ray/common/id.h"
#include "ray/common/task/task.h"
#include "ray/object_manager/object_manager.h"
#include "ray/raylet/reconstruction_policy.h"
// clang-format on
namespace ray {
namespace raylet {
// Makes rpc::TaskLeaseData nameable as ray::raylet::TaskLeaseData. No
// declaration in this header refers to it.
// NOTE(review): presumably kept for other files that include this header --
// confirm before removing.
using rpc::TaskLeaseData;
// Forward declaration. The declarations in this header use
// ReconstructionPolicyInterface rather than this type.
// NOTE(review): looks unused here -- confirm against includers before removing.
class ReconstructionPolicy;
/// \class TaskDependencyManagerInterface
///
/// Abstract interface over the task-argument / `ray.get` dependency tracking
/// that TaskDependencyManager implements. Used for unit-testing the
/// ClusterTaskManager, which calls these methods for locally queued tasks that
/// have dependencies.
class TaskDependencyManagerInterface {
 public:
  /// Subscribe to the object dependencies required by a task.
  ///
  /// \param task_id The ID of the task whose dependencies to subscribe to.
  /// \param required_objects The objects required by the task.
  /// \return Whether all of the given dependencies for the given task are
  /// local.
  virtual bool SubscribeGetDependencies(
      const TaskID &task_id,
      const std::vector<rpc::ObjectReference> &required_objects) = 0;

  /// Check whether a task is ready to run.
  ///
  /// \param task_id The ID of the task to check.
  /// \return Whether all of the dependencies for the task are local.
  virtual bool IsTaskReady(const TaskID &task_id) const = 0;

  /// Unsubscribe from the object dependencies previously subscribed for a
  /// task.
  ///
  /// \param task_id The ID of the task whose dependencies we should
  /// unsubscribe from.
  /// \return Whether the task was subscribed before.
  virtual bool UnsubscribeGetDependencies(const TaskID &task_id) = 0;

  /// Virtual so that implementations can be destroyed through a pointer or
  /// reference to this interface.
  virtual ~TaskDependencyManagerInterface() = default;
};
/// \class TaskDependencyManager
///
/// Responsible for managing object dependencies for tasks. The caller can
/// subscribe to object dependencies for a task. The task manager will
/// determine which object dependencies are remote. These are the objects that
/// are neither in the local object store, nor will they be created by a
/// locally queued task. The task manager will request that these objects be
/// made available locally, either by object transfer from a remote node or
/// reconstruction. The task manager will also cancel these objects if they are
/// no longer needed by any task.
class TaskDependencyManager : public TaskDependencyManagerInterface {
 public:
  /// Create a task dependency manager.
  ///
  /// Both arguments are taken by reference and the corresponding members are
  /// references, so the referenced objects are expected to outlive this
  /// manager.
  ///
  /// \param object_manager The object manager, used to fetch required objects
  /// from remote nodes.
  /// \param reconstruction_policy The reconstruction policy, used to
  /// reconstruct required objects that no longer exist on any live nodes.
  TaskDependencyManager(ObjectManagerInterface &object_manager,
                        ReconstructionPolicyInterface &reconstruction_policy);

  /// Check whether an object is locally available.
  ///
  /// \param object_id The object to check for.
  /// \return Whether the object is local.
  bool CheckObjectLocal(const ObjectID &object_id) const;

  /// Subscribe to object dependencies required by the task and check whether
  /// all dependencies are fulfilled. This should be called for task arguments and
  /// `ray.get` calls during task execution.
  ///
  /// The TaskDependencyManager will track the task's dependencies
  /// until UnsubscribeGetDependencies is called on the same task ID. If any
  /// dependencies are remote, then they will be requested. When the last
  /// remote dependency later appears locally via a call to HandleObjectLocal,
  /// the subscribed task will be returned by the HandleObjectLocal call,
  /// signifying that it is ready to run. This method may be called multiple
  /// times per task.
  ///
  /// \param task_id The ID of the task whose dependencies to subscribe to.
  /// \param required_objects The objects required by the task.
  /// \return Whether all of the given dependencies for the given task are
  /// local.
  bool SubscribeGetDependencies(
      const TaskID &task_id,
      const std::vector<rpc::ObjectReference> &required_objects) override;

  /// Check whether a task is ready to run. The task ID must
  /// have been previously subscribed by the caller.
  ///
  /// \param task_id The ID of the task to check.
  /// \return Whether all of the dependencies for the task are
  /// local.
  bool IsTaskReady(const TaskID &task_id) const override;

  /// Subscribe to object dependencies required by the worker. This should be called for
  /// ray.wait calls during task execution.
  ///
  /// The TaskDependencyManager will track all remote dependencies until the
  /// dependencies are local, or until UnsubscribeWaitDependencies is called
  /// with the same worker ID, whichever occurs first. Remote dependencies will
  /// be requested. This method may be called multiple times per worker on the
  /// same objects.
  ///
  /// \param worker_id The ID of the worker that called `ray.wait`.
  /// \param required_objects The objects required by the worker.
  /// \return Void.
  void SubscribeWaitDependencies(
      const WorkerID &worker_id,
      const std::vector<rpc::ObjectReference> &required_objects);

  /// Unsubscribe from the object dependencies required by this task through the task
  /// arguments or `ray.get`. If the objects were remote and are no longer required by any
  /// subscribed task, then they will be canceled.
  ///
  /// \param task_id The ID of the task whose dependencies we should unsubscribe from.
  /// \return Whether the task was subscribed before.
  bool UnsubscribeGetDependencies(const TaskID &task_id) override;

  /// Unsubscribe from the object dependencies required by this worker through `ray.wait`.
  /// If the objects were remote and are no longer required by any subscribed task, then
  /// they will be canceled.
  ///
  /// \param worker_id The ID of the worker whose dependencies we should unsubscribe from.
  /// \return Void.
  void UnsubscribeWaitDependencies(const WorkerID &worker_id);

  /// Mark that the given task is pending execution. Any objects that it creates
  /// are now considered to be pending creation. If there are any subscribed
  /// tasks that depend on these objects, then the objects will be canceled.
  ///
  /// \param task The task that is pending execution.
  void TaskPending(const Task &task);

  /// Mark that the given task is no longer pending execution. Any objects that
  /// it creates that are not already local are now considered to be remote. If
  /// there are any subscribed tasks that depend on these objects, then the
  /// objects will be requested.
  ///
  /// \param task_id The ID of the task to cancel.
  void TaskCanceled(const TaskID &task_id);

  /// Handle an object becoming locally available. If there are any subscribed
  /// tasks that depend on this object, then the object will be canceled.
  ///
  /// \param object_id The object ID of the object to mark as locally
  /// available.
  /// \return A list of task IDs. This contains all subscribed tasks that now
  /// have all of their dependencies fulfilled, once this object was made
  /// local.
  std::vector<TaskID> HandleObjectLocal(const ray::ObjectID &object_id);

  /// Handle an object that is no longer locally available. If there are any
  /// subscribed tasks that depend on this object, then the object will be
  /// requested.
  ///
  /// \param object_id The object ID of the object that was previously locally
  /// available.
  /// \return A list of task IDs. This contains all subscribed tasks that
  /// previously had all of their dependencies fulfilled, but are now missing
  /// this object dependency.
  std::vector<TaskID> HandleObjectMissing(const ray::ObjectID &object_id);

  /// Remove all of the tasks specified. These tasks will no longer be
  /// considered pending and the objects they depend on will no longer be
  /// required.
  ///
  /// \param task_ids The collection of task IDs. For a given task in this set,
  /// all tasks that depend on the task must also be included in the set.
  void RemoveTasksAndRelatedObjects(const std::unordered_set<TaskID> &task_ids);

  /// Returns debug string for class.
  ///
  /// \return string.
  std::string DebugString() const;

  /// Get the address of the owner of this object. An address will only be
  /// returned if the caller previously specified that this object is required
  /// on this node, through a call to SubscribeGetDependencies or
  /// SubscribeWaitDependencies.
  ///
  /// \param[in] object_id The object whose owner to get.
  /// \param[out] owner_address The address of the object's owner, if
  /// available.
  /// \return True if we have owner information for the object.
  bool GetOwnerAddress(const ObjectID &object_id, rpc::Address *owner_address) const;

 private:
  /// The subscribers of a single object, together with the object's owner.
  struct ObjectDependencies {
    /// Records a copy of the owner address carried by the object reference.
    ///
    /// NOTE(review): this single-argument constructor is left non-explicit so
    /// that existing call sites keep compiling; consider making it `explicit`
    /// once the call sites have been checked.
    ObjectDependencies(const rpc::ObjectReference &ref)
        : owner_address(ref.owner_address()) {}

    /// The tasks that depend on this object, either because the object is a task argument
    /// or because the task called `ray.get` on the object.
    std::unordered_set<TaskID> dependent_tasks;
    /// The workers that depend on this object because they called `ray.wait` on the
    /// object.
    std::unordered_set<WorkerID> dependent_workers;
    /// The address of the worker that owns this object.
    rpc::Address owner_address;

    /// \return Whether no task and no worker depends on this object.
    bool Empty() const { return dependent_tasks.empty() && dependent_workers.empty(); }
  };

  /// A struct to represent the object dependencies of a task.
  struct TaskDependencies {
    /// The objects that the task depends on. These are either the arguments to
    /// the task or objects that the task calls `ray.get` on. These must be
    /// local before the task is ready to execute. Objects are removed from
    /// this set once UnsubscribeGetDependencies is called.
    std::unordered_set<ObjectID> get_dependencies;
    /// The number of object arguments that are not available locally. This
    /// must be zero before the task is ready to execute. Initialized to zero
    /// so that a default-initialized instance never starts with an
    /// indeterminate count.
    int64_t num_missing_get_dependencies = 0;
  };

  /// The objects that the worker is fetching. These are objects that a task that executed
  /// or is executing on the worker called `ray.wait` on that are not yet local. An object
  /// will be automatically removed from this set once it becomes local.
  using WorkerDependencies = std::unordered_set<ObjectID>;

  /// Check whether the given object needs to be made available through object
  /// transfer or reconstruction. These are objects for which: (1) there is a
  /// subscribed task dependent on it, (2) the object is not local, and (3) the
  /// task that creates the object is not pending execution locally.
  ///
  /// \param object_id The object to check.
  /// \param owner_address NOTE(review): presumably an out-parameter that
  /// receives the owner's address when the object is required -- confirm
  /// against the implementation.
  /// \return Whether the object is required.
  bool CheckObjectRequired(const ObjectID &object_id, rpc::Address *owner_address) const;

  /// If the given object is required, then request that the object be made
  /// available through object transfer or reconstruction.
  void HandleRemoteDependencyRequired(const ObjectID &object_id);

  /// If the given object is no longer required, then cancel any in-progress
  /// operations to make the object available through object transfer or
  /// reconstruction.
  void HandleRemoteDependencyCanceled(const ObjectID &object_id);

  /// The object manager, used to fetch required objects from remote nodes.
  ObjectManagerInterface &object_manager_;
  /// The reconstruction policy, used to reconstruct required objects that no
  /// longer exist on any live nodes.
  ReconstructionPolicyInterface &reconstruction_policy_;
  /// A mapping from task ID of each subscribed task to its list of object
  /// dependencies, either task arguments or objects passed into `ray.get`.
  std::unordered_map<ray::TaskID, TaskDependencies> task_dependencies_;
  /// A mapping from worker ID to each object that the worker called `ray.wait` on.
  std::unordered_map<ray::WorkerID, WorkerDependencies> worker_dependencies_;
  /// All tasks whose outputs are required by a subscribed task. This is a
  /// mapping from task ID to information about the objects that the task
  /// creates, either by return value or by `ray.put`. For each object, we
  /// store the IDs of the subscribed tasks that are dependent on the object.
  std::unordered_map<ray::TaskID, std::unordered_map<ObjectID, ObjectDependencies>>
      required_tasks_;
  /// Objects that are required by a subscribed task, are not local, and are
  /// not created by a pending task. For these objects, there are pending
  /// operations to make the object available.
  std::unordered_set<ray::ObjectID> required_objects_;
  /// The set of locally available objects.
  std::unordered_set<ray::ObjectID> local_objects_;
  /// The set of tasks that are pending execution. Any objects created by these
  /// tasks that are not already local are pending creation.
  std::unordered_set<ray::TaskID> pending_tasks_;
};
} // namespace raylet
} // namespace ray