Revert "Use Boost.Process instead of pid_t (#6510)" (#6909)

This reverts commit fb8e3615d5.
This commit is contained in:
mehrdadn
2020-01-26 08:26:44 -08:00
committed by Edward Oakes
parent 2fca550096
commit 38ec2e7052
11 changed files with 203 additions and 333 deletions
-1
View File
@@ -399,7 +399,6 @@ cc_library(
":stats_lib",
":worker_rpc",
"@boost//:asio",
"@boost//:process",
"@com_github_jupp0r_prometheus_cpp//pull",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/container:flat_hash_set",
-2
View File
@@ -123,8 +123,6 @@ def ray_deps_setup():
# Backport Clang-Cl patch on Boost 1.69 to Boost <= 1.68:
# https://lists.boost.org/Archives/boost/2018/09/243420.php
"//thirdparty/patches:boost-type_traits-trivial_move.patch",
# Partially backport waitpid() patch on Boost 1.72 to Boost <= 1.68
"//thirdparty/patches:boost-process-teminate-waitpid-nohang.patch",
],
)
+3 -3
View File
@@ -39,7 +39,7 @@ def wait_for_pid_to_exit(pid, timeout=20):
return
time.sleep(0.1)
raise RayTestTimeoutException(
"Timed out while waiting for process {} to exit.".format(pid))
"Timed out while waiting for process to exit.")
def wait_for_children_of_pid(pid, num_children=1, timeout=20):
@@ -51,8 +51,8 @@ def wait_for_children_of_pid(pid, num_children=1, timeout=20):
return
time.sleep(0.1)
raise RayTestTimeoutException(
"Timed out while waiting for process {} children to start "
"({}/{} started).".format(pid, num_alive, num_children))
"Timed out while waiting for process children to start "
"({}/{} started).".format(num_alive, num_children))
def wait_for_children_of_pid_to_exit(pid, timeout=20):
+27 -34
View File
@@ -89,9 +89,8 @@ NodeManager::NodeManager(boost::asio::io_service &io_service,
object_manager_profile_timer_(io_service),
initial_config_(config),
local_available_resources_(config.resource_config),
worker_pool_(io_service, config.num_initial_workers,
config.maximum_startup_concurrency, gcs_client_,
config.worker_commands),
worker_pool_(config.num_initial_workers, config.maximum_startup_concurrency,
gcs_client_, config.worker_commands),
scheduling_policy_(local_queues_),
reconstruction_policy_(
io_service_,
@@ -229,23 +228,22 @@ void NodeManager::HandleUnexpectedWorkerFailure(
}
void NodeManager::KillWorker(std::shared_ptr<Worker> worker) {
#ifdef _WIN32
// TODO(mehrdadn): Implement implement graceful process termination mechanism
#else
// If we're just cleaning up a single worker, allow it some time to clean
// up its state before force killing. The client socket will be closed
// and the worker struct will be freed after the timeout.
kill(worker->Process().get()->id(), SIGTERM);
#endif
kill(worker->Pid(), SIGTERM);
auto retry_timer = std::make_shared<boost::asio::deadline_timer>(io_service_);
auto retry_duration = boost::posix_time::milliseconds(
RayConfig::instance().kill_worker_timeout_milliseconds());
retry_timer->expires_from_now(retry_duration);
retry_timer->async_wait([retry_timer, worker](const boost::system::error_code &error) {
RAY_LOG(DEBUG) << "Send SIGKILL to worker, pid=" << worker->Process().get()->id();
// Force kill worker
worker->Process().get()->terminate();
RAY_LOG(DEBUG) << "Send SIGKILL to worker, pid=" << worker->Pid();
// Force kill worker. TODO(mehrdadn, rkn): The worker may have already died
// and had its PID reassigned to a different process, at least on Windows.
// On Linux, this may or may not be the case, depending on e.g. whether
// the process has been already waited on. Regardless, this must be fixed.
kill(worker->Pid(), SIGKILL);
});
}
@@ -857,9 +855,8 @@ void NodeManager::ProcessClientMessage(
RAY_LOG(DEBUG) << "[Worker] Message "
<< protocol::EnumNameMessageType(message_type_value) << "("
<< message_type << ") from worker with PID "
<< (registered_worker
? std::to_string(registered_worker->Process().get()->id())
: "nil");
<< (registered_worker ? std::to_string(registered_worker->Pid())
: "nil");
if (registered_worker && registered_worker->IsDead()) {
// For a worker that is marked as dead (because the job has died already),
// all the messages are ignored except DisconnectClient.
@@ -966,6 +963,12 @@ void NodeManager::ProcessClientMessage(
void NodeManager::ProcessRegisterClientRequestMessage(
const std::shared_ptr<LocalClientConnection> &client, const uint8_t *message_data) {
client->Register();
auto message = flatbuffers::GetRoot<protocol::RegisterClientRequest>(message_data);
Language language = static_cast<Language>(message->language());
WorkerID worker_id = from_flatbuf<WorkerID>(*message->worker_id());
auto worker = std::make_shared<Worker>(worker_id, message->worker_pid(), language,
message->port(), client, client_call_manager_);
Status status;
flatbuffers::FlatBufferBuilder fbb;
auto reply =
ray::protocol::CreateRegisterClientReply(fbb, to_flatbuf(fbb, self_node_id_));
@@ -980,31 +983,24 @@ void NodeManager::ProcessRegisterClientRequestMessage(
}
});
auto message = flatbuffers::GetRoot<protocol::RegisterClientRequest>(message_data);
Language language = static_cast<Language>(message->language());
WorkerID worker_id = from_flatbuf<WorkerID>(*message->worker_id());
pid_t pid = message->worker_pid();
auto worker = std::make_shared<Worker>(worker_id, language, message->port(), client,
client_call_manager_);
if (message->is_worker()) {
// Register the new worker.
if (worker_pool_.RegisterWorker(worker, pid).ok()) {
if (worker_pool_.RegisterWorker(worker).ok()) {
HandleWorkerAvailable(worker->Connection());
}
} else {
// Register the new driver.
worker->SetProcess(ProcessHandle::FromPid(pid));
const JobID job_id = from_flatbuf<JobID>(*message->job_id());
// Compute a dummy driver task id from a given driver.
const TaskID driver_task_id = TaskID::ComputeDriverTaskId(worker_id);
worker->AssignTaskId(driver_task_id);
worker->AssignJobId(job_id);
Status status = worker_pool_.RegisterDriver(worker);
status = worker_pool_.RegisterDriver(worker);
if (status.ok()) {
local_queues_.AddDriverTaskId(driver_task_id);
auto job_data_ptr =
gcs::CreateJobTableData(job_id, /*is_dead*/ false, std::time(nullptr),
initial_config_.node_manager_address, pid);
auto job_data_ptr = gcs::CreateJobTableData(
job_id, /*is_dead*/ false, std::time(nullptr),
initial_config_.node_manager_address, message->worker_pid());
RAY_CHECK_OK(gcs_client_->Jobs().AsyncAdd(job_data_ptr, nullptr));
}
}
@@ -1200,8 +1196,7 @@ void NodeManager::ProcessDisconnectClientMessage(
cluster_resource_map_[self_node_id_].Release(lifetime_resources.ToResourceSet());
worker->ResetLifetimeResourceIds();
RAY_LOG(DEBUG) << "Worker (pid=" << worker->Process().get()->id()
<< ") is disconnected. "
RAY_LOG(DEBUG) << "Worker (pid=" << worker->Pid() << ") is disconnected. "
<< "job_id: " << worker->GetAssignedJobId();
// Since some resources may have been released, we can try to dispatch more tasks.
@@ -1215,8 +1210,7 @@ void NodeManager::ProcessDisconnectClientMessage(
local_queues_.RemoveDriverTaskId(TaskID::ComputeDriverTaskId(driver_id));
worker_pool_.DisconnectDriver(worker);
RAY_LOG(DEBUG) << "Driver (pid=" << worker->Process().get()->id()
<< ") is disconnected. "
RAY_LOG(DEBUG) << "Driver (pid=" << worker->Pid() << ") is disconnected. "
<< "job_id: " << job_id;
}
@@ -2296,8 +2290,7 @@ void NodeManager::AssignTask(const std::shared_ptr<Worker> &worker, const Task &
}
RAY_LOG(DEBUG) << "Assigning task " << spec.TaskId() << " to worker with pid "
<< worker->Process().get()->id()
<< ", worker id: " << worker->WorkerId();
<< worker->Pid() << ", worker id: " << worker->WorkerId();
flatbuffers::FlatBufferBuilder fbb;
// Resource accounting: acquire resources for the assigned task.
@@ -3128,7 +3121,7 @@ void NodeManager::HandleGetNodeStats(const rpc::GetNodeStatsRequest &request,
rpc::SendReplyCallback send_reply_callback) {
for (const auto &driver : worker_pool_.GetAllDrivers()) {
auto worker_stats = reply->add_workers_stats();
worker_stats->set_pid(driver->Process().get()->id());
worker_stats->set_pid(driver->Pid());
worker_stats->set_is_driver(true);
}
for (const auto task : local_queues_.GetTasks(TaskState::INFEASIBLE)) {
@@ -3191,7 +3184,7 @@ void NodeManager::HandleGetNodeStats(const rpc::GetNodeStatsRequest &request,
<< status.ToString();
} else {
auto worker_stats = reply->add_workers_stats();
worker_stats->set_pid(worker->Process().get()->id());
worker_stats->set_pid(worker->Pid());
worker_stats->set_is_driver(false);
reply->set_num_workers(reply->num_workers() + 1);
worker_stats->mutable_core_worker_stats()->MergeFrom(r.core_worker_stats());
+3 -7
View File
@@ -12,10 +12,11 @@ namespace ray {
namespace raylet {
/// A constructor responsible for initializing the state of a worker.
Worker::Worker(const WorkerID &worker_id, const Language &language, int port,
Worker::Worker(const WorkerID &worker_id, pid_t pid, const Language &language, int port,
std::shared_ptr<LocalClientConnection> connection,
rpc::ClientCallManager &client_call_manager)
: worker_id_(worker_id),
pid_(pid),
language_(language),
port_(port),
connection_(connection),
@@ -41,12 +42,7 @@ bool Worker::IsBlocked() const { return blocked_; }
WorkerID Worker::WorkerId() const { return worker_id_; }
ProcessHandle Worker::Process() const { return proc_; }
void Worker::SetProcess(const ProcessHandle &proc) {
RAY_CHECK(!proc_); // this procedure should not be called multiple times
proc_ = proc;
}
pid_t Worker::Pid() const { return pid_; }
Language Worker::GetLanguage() const { return language_; }
+7 -8
View File
@@ -9,7 +9,8 @@
#include "ray/common/task/task.h"
#include "ray/common/task/task_common.h"
#include "ray/rpc/worker/core_worker_client.h"
#include "ray/util/process.h"
#include <unistd.h> // pid_t
namespace ray {
@@ -21,8 +22,7 @@ namespace raylet {
class Worker {
public:
/// A constructor that initializes a worker object.
/// NOTE: You MUST manually set the worker process.
Worker(const WorkerID &worker_id, const Language &language, int port,
Worker(const WorkerID &worker_id, pid_t pid, const Language &language, int port,
std::shared_ptr<LocalClientConnection> connection,
rpc::ClientCallManager &client_call_manager);
/// A destructor responsible for freeing all worker state.
@@ -34,9 +34,8 @@ class Worker {
bool IsBlocked() const;
/// Return the worker's ID.
WorkerID WorkerId() const;
/// Return the worker process.
ProcessHandle Process() const;
void SetProcess(const ProcessHandle &proc);
/// Return the worker's PID.
pid_t Pid() const;
Language GetLanguage() const;
int Port() const;
void AssignTaskId(const TaskID &task_id);
@@ -80,8 +79,8 @@ class Worker {
private:
/// The worker's ID.
WorkerID worker_id_;
/// The worker's process.
ProcessHandle proc_;
/// The worker's PID.
pid_t pid_;
/// The language type of this worker.
Language language_;
/// Port that this worker listens on.
+114 -63
View File
@@ -1,14 +1,14 @@
#include "ray/raylet/worker_pool.h"
#ifdef _WIN32
#include <Windows.h>
#include <process.h>
#endif
#include <sys/wait.h>
#include <algorithm>
#include <boost/asio/io_service.hpp>
#include <boost/process/args.hpp>
#include <boost/process/async.hpp>
#include <boost/process/search_path.hpp>
#include "ray/common/constants.h"
#include "ray/common/ray_config.h"
#include "ray/common/status.h"
@@ -46,14 +46,19 @@ namespace raylet {
/// A constructor that initializes a worker pool with num_workers workers for
/// each language.
WorkerPool::WorkerPool(boost::asio::io_service &io_service, int num_workers,
int maximum_startup_concurrency,
WorkerPool::WorkerPool(int num_workers, int maximum_startup_concurrency,
std::shared_ptr<gcs::GcsClient> gcs_client,
const WorkerCommandMap &worker_commands)
: io_service_(&io_service),
maximum_startup_concurrency_(maximum_startup_concurrency),
: maximum_startup_concurrency_(maximum_startup_concurrency),
gcs_client_(std::move(gcs_client)) {
RAY_CHECK(maximum_startup_concurrency > 0);
#ifdef _WIN32
// TODO(mehrdadn): Is there an equivalent of this we need for Windows?
#else
// Ignore SIGCHLD signals. If we don't do this, then worker processes will
// become zombies instead of dying gracefully.
signal(SIGCHLD, SIG_IGN);
#endif
for (const auto &entry : worker_commands) {
// Initialize the pool state for this language.
auto &state = states_by_lang_[entry.first];
@@ -95,21 +100,25 @@ void WorkerPool::Start(int num_workers) {
}
WorkerPool::~WorkerPool() {
std::unordered_set<ProcessHandle> procs_to_kill;
std::unordered_set<pid_t> pids_to_kill;
for (const auto &entry : states_by_lang_) {
// Kill all registered workers. NOTE(swang): This assumes that the registered
// workers were started by the pool.
for (const auto &worker : entry.second.registered_workers) {
procs_to_kill.insert(worker->Process());
pids_to_kill.insert(worker->Pid());
}
// Kill all the workers that have been started but not registered.
for (const auto &starting_worker : entry.second.starting_worker_processes) {
procs_to_kill.insert(starting_worker.first);
pids_to_kill.insert(starting_worker.first);
}
}
for (const auto &proc : procs_to_kill) {
proc.get()->terminate();
proc.get()->wait();
for (const auto &pid : pids_to_kill) {
RAY_CHECK(pid > 0);
kill(pid, SIGKILL);
}
// Waiting for the workers to be killed
for (const auto &pid : pids_to_kill) {
waitpid(pid, NULL, 0);
}
}
@@ -123,8 +132,8 @@ uint32_t WorkerPool::Size(const Language &language) const {
}
}
ProcessHandle WorkerPool::StartWorkerProcess(
const Language &language, const std::vector<std::string> &dynamic_options) {
int WorkerPool::StartWorkerProcess(const Language &language,
const std::vector<std::string> &dynamic_options) {
auto &state = GetStateForLanguage(language);
// If we are already starting up too many workers, then return without starting
// more.
@@ -137,7 +146,7 @@ ProcessHandle WorkerPool::StartWorkerProcess(
RAY_LOG(DEBUG) << "Worker not started, " << starting_workers
<< " workers of language type " << static_cast<int>(language)
<< " pending registration";
return ProcessHandle();
return -1;
}
// Either there are no workers pending registration or the worker start is being forced.
RAY_LOG(DEBUG) << "Starting new worker process, current pool has "
@@ -185,16 +194,71 @@ ProcessHandle WorkerPool::StartWorkerProcess(
<< Language_Name(language) << " worker process. But the "
<< kWorkerNumWorkersPlaceholder << "placeholder is not found in worker command.";
ProcessHandle proc = StartProcess(worker_command_args);
RAY_CHECK(proc);
RAY_LOG(DEBUG) << "Started worker process of " << workers_to_start
<< " worker(s) with pid " << proc.get()->id();
state.starting_worker_processes.emplace(proc, workers_to_start);
return proc;
pid_t pid = StartProcess(worker_command_args);
if (pid < 0) {
// Failure case.
RAY_LOG(FATAL) << "Failed to fork worker process: " << strerror(errno);
} else if (pid > 0) {
// Parent process case.
RAY_LOG(DEBUG) << "Started worker process of " << workers_to_start
<< " worker(s) with pid " << pid;
state.starting_worker_processes.emplace(pid, workers_to_start);
return pid;
}
return -1;
}
ProcessHandle WorkerPool::StartProcess(
const std::vector<std::string> &worker_command_args) {
#ifdef _WIN32
// Fork + exec combo for Windows. Returns -1 on failure.
// TODO(mehrdadn): This is dangerous on Windows.
// We need to keep the actual process handle alive for the PID to stay valid.
// Make this change as soon as possible, or the PID may refer to the wrong process.
static pid_t spawnvp_wrapper(std::vector<std::string> const &args) {
pid_t pid;
std::vector<const char *> str_args;
for (const auto &arg : args) {
str_args.push_back(arg.c_str());
}
str_args.push_back(NULL);
HANDLE handle = (HANDLE)spawnvp(P_NOWAIT, str_args[0], str_args.data());
if (handle != INVALID_HANDLE_VALUE) {
pid = static_cast<pid_t>(GetProcessId(handle));
if (pid == 0) {
pid = -1;
}
CloseHandle(handle);
} else {
pid = -1;
errno = EINVAL;
}
return pid;
}
#else
// Fork + exec combo for POSIX. Returns -1 on failure.
static pid_t spawnvp_wrapper(std::vector<std::string> const &args) {
pid_t pid;
std::vector<const char *> str_args;
for (const auto &arg : args) {
str_args.push_back(arg.c_str());
}
str_args.push_back(NULL);
pid = fork();
if (pid == 0) {
// Child process case.
// Reset the SIGCHLD handler for the worker.
// TODO(mehrdadn): Move any work here to the child process itself
// so that it can also be implemented on Windows.
signal(SIGCHLD, SIG_DFL);
if (execvp(str_args[0], const_cast<char *const *>(str_args.data())) == -1) {
pid = -1;
abort(); // fork() succeeded but exec() failed, so abort the child
}
}
return pid;
}
#endif
pid_t WorkerPool::StartProcess(const std::vector<std::string> &worker_command_args) {
if (RAY_LOG_ENABLED(DEBUG)) {
std::stringstream stream;
stream << "Starting worker process with command:";
@@ -205,35 +269,25 @@ ProcessHandle WorkerPool::StartProcess(
}
// Launch the process to create the worker.
auto exit_callback = [=](int, const std::error_code &ec) {
// This callback seems to be necessary for proper zombie cleanup.
// However, it doesn't need to do anything.
};
std::error_code ec;
ProcessHandle child(
std::make_shared<Process>(boost::process::args(worker_command_args), *io_service_,
boost::process::on_exit = exit_callback, ec));
if (!child.get()->valid()) {
child = ProcessHandle();
pid_t pid = spawnvp_wrapper(worker_command_args);
if (pid == -1) {
RAY_LOG(FATAL) << "Failed to start worker with error " << errno << ": "
<< strerror(errno);
}
if (!child || !child.get()->valid() || ec) {
// The worker failed to start. This is a fatal error.
RAY_LOG(FATAL) << "Failed to start worker with return value " << ec << ": "
<< ec.message();
}
return child;
return pid;
}
Status WorkerPool::RegisterWorker(const std::shared_ptr<Worker> &worker, pid_t pid) {
Status WorkerPool::RegisterWorker(const std::shared_ptr<Worker> &worker) {
const auto pid = worker->Pid();
const auto port = worker->Port();
RAY_LOG(DEBUG) << "Registering worker with pid " << pid << ", port: " << port;
auto &state = GetStateForLanguage(worker->GetLanguage());
auto it = state.starting_worker_processes.find(ProcessHandle::FromPid(pid));
auto it = state.starting_worker_processes.find(pid);
if (it == state.starting_worker_processes.end()) {
RAY_LOG(WARNING) << "Received a register request from an unknown worker " << pid;
return Status::Invalid("Unknown worker");
}
worker->SetProcess(it->first);
it->second--;
if (it->second == 0) {
state.starting_worker_processes.erase(it);
@@ -278,7 +332,7 @@ void WorkerPool::PushWorker(const std::shared_ptr<Worker> &worker) {
<< "Idle workers cannot have an assigned task ID";
auto &state = GetStateForLanguage(worker->GetLanguage());
auto it = state.dedicated_workers_to_tasks.find(worker->Process());
auto it = state.dedicated_workers_to_tasks.find(worker->Pid());
if (it != state.dedicated_workers_to_tasks.end()) {
// The worker is used for the actor creation task with dynamic options.
// Put it into idle dedicated worker pool.
@@ -299,7 +353,7 @@ std::shared_ptr<Worker> WorkerPool::PopWorker(const TaskSpecification &task_spec
auto &state = GetStateForLanguage(task_spec.GetLanguage());
std::shared_ptr<Worker> worker = nullptr;
ProcessHandle proc;
int pid = -1;
if (task_spec.IsActorCreationTask() && !task_spec.DynamicWorkerOptions().empty()) {
// Code path of actor creation task with dynamic worker options.
// Try to pop it from idle dedicated pool.
@@ -310,16 +364,15 @@ std::shared_ptr<Worker> WorkerPool::PopWorker(const TaskSpecification &task_spec
state.idle_dedicated_workers.erase(it);
// Because we found a worker that can perform this task,
// we can remove it from dedicated_workers_to_tasks.
state.dedicated_workers_to_tasks.erase(worker->Process());
state.dedicated_workers_to_tasks.erase(worker->Pid());
state.tasks_to_dedicated_workers.erase(task_spec.TaskId());
} else if (!HasPendingWorkerForTask(task_spec.GetLanguage(), task_spec.TaskId())) {
// We are not pending a registration from a worker for this task,
// so start a new worker process for this task.
proc =
StartWorkerProcess(task_spec.GetLanguage(), task_spec.DynamicWorkerOptions());
if (proc) {
state.dedicated_workers_to_tasks[proc] = task_spec.TaskId();
state.tasks_to_dedicated_workers[task_spec.TaskId()] = proc;
pid = StartWorkerProcess(task_spec.GetLanguage(), task_spec.DynamicWorkerOptions());
if (pid > 0) {
state.dedicated_workers_to_tasks[pid] = task_spec.TaskId();
state.tasks_to_dedicated_workers[task_spec.TaskId()] = pid;
}
}
} else if (!task_spec.IsActorTask()) {
@@ -330,7 +383,7 @@ std::shared_ptr<Worker> WorkerPool::PopWorker(const TaskSpecification &task_spec
} else {
// There are no more non-actor workers available to execute this task.
// Start a new worker process.
proc = StartWorkerProcess(task_spec.GetLanguage());
pid = StartWorkerProcess(task_spec.GetLanguage());
}
} else {
// Code path of actor task.
@@ -342,7 +395,7 @@ std::shared_ptr<Worker> WorkerPool::PopWorker(const TaskSpecification &task_spec
}
}
if (worker == nullptr && proc) {
if (worker == nullptr && pid > 0) {
WarnAboutSize();
}
@@ -355,7 +408,7 @@ bool WorkerPool::DisconnectWorker(const std::shared_ptr<Worker> &worker) {
stats::CurrentWorker().Record(
0, {{stats::LanguageKey, Language_Name(worker->GetLanguage())},
{stats::WorkerPidKey, std::to_string(worker->Process().get()->id())}});
{stats::WorkerPidKey, std::to_string(worker->Pid())}});
return RemoveWorker(state.idle, worker);
}
@@ -365,7 +418,7 @@ void WorkerPool::DisconnectDriver(const std::shared_ptr<Worker> &driver) {
RAY_CHECK(RemoveWorker(state.registered_drivers, driver));
stats::CurrentDriver().Record(
0, {{stats::LanguageKey, Language_Name(driver->GetLanguage())},
{stats::WorkerPidKey, std::to_string(driver->Process().get()->id())}});
{stats::WorkerPidKey, std::to_string(driver->Pid())}});
}
inline WorkerPool::State &WorkerPool::GetStateForLanguage(const Language &language) {
@@ -481,17 +534,15 @@ void WorkerPool::RecordMetrics() const {
// Record worker.
for (auto worker : entry.second.registered_workers) {
stats::CurrentWorker().Record(
worker->Process().get()->id(),
{{stats::LanguageKey, Language_Name(worker->GetLanguage())},
{stats::WorkerPidKey, std::to_string(worker->Process().get()->id())}});
worker->Pid(), {{stats::LanguageKey, Language_Name(worker->GetLanguage())},
{stats::WorkerPidKey, std::to_string(worker->Pid())}});
}
// Record driver.
for (auto driver : entry.second.registered_drivers) {
stats::CurrentDriver().Record(
driver->Process().get()->id(),
{{stats::LanguageKey, Language_Name(driver->GetLanguage())},
{stats::WorkerPidKey, std::to_string(driver->Process().get()->id())}});
driver->Pid(), {{stats::LanguageKey, Language_Name(driver->GetLanguage())},
{stats::WorkerPidKey, std::to_string(driver->Pid())}});
}
}
}
+10 -17
View File
@@ -5,9 +5,6 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/asio/io_service.hpp>
#include "gtest/gtest.h"
#include "ray/common/client_connection.h"
@@ -43,8 +40,8 @@ class WorkerPool {
/// resources on the machine).
/// \param worker_commands The commands used to start the worker process, grouped by
/// language.
WorkerPool(boost::asio::io_service &io_service, int num_workers,
int maximum_startup_concurrency, std::shared_ptr<gcs::GcsClient> gcs_client,
WorkerPool(int num_workers, int maximum_startup_concurrency,
std::shared_ptr<gcs::GcsClient> gcs_client,
const WorkerCommandMap &worker_commands);
/// Destructor responsible for freeing a set of workers owned by this class.
@@ -55,7 +52,7 @@ class WorkerPool {
///
/// \param The Worker to be registered.
/// \return If the registration is successful.
Status RegisterWorker(const std::shared_ptr<Worker> &worker, pid_t pid);
Status RegisterWorker(const std::shared_ptr<Worker> &worker);
/// Register a new driver.
///
@@ -160,16 +157,14 @@ class WorkerPool {
/// \param dynamic_options The dynamic options that we should add for worker command.
/// \return The id of the process that we started if it's positive,
/// otherwise it means we didn't start a process.
ProcessHandle StartWorkerProcess(const Language &language,
const std::vector<std::string> &dynamic_options = {});
int StartWorkerProcess(const Language &language,
const std::vector<std::string> &dynamic_options = {});
/// The implementation of how to start a new worker process with command arguments.
/// The lifetime of the process is tied to that of the returned object,
/// unless the caller manually detaches the process after the call.
///
/// \param worker_command_args The command arguments of new worker process.
/// \return An object representing the started worker process.
virtual ProcessHandle StartProcess(const std::vector<std::string> &worker_command_args);
/// \return The process ID of started worker process.
virtual pid_t StartProcess(const std::vector<std::string> &worker_command_args);
/// Push an warning message to user if worker pool is getting to big.
virtual void WarnAboutSize();
@@ -194,12 +189,12 @@ class WorkerPool {
std::unordered_set<std::shared_ptr<Worker>> registered_drivers;
/// A map from the pids of starting worker processes
/// to the number of their unregistered workers.
std::unordered_map<ProcessHandle, int> starting_worker_processes;
std::unordered_map<pid_t, int> starting_worker_processes;
/// A map for looking up the task with dynamic options by the pid of
/// worker. Note that this is used for the dedicated worker processes.
std::unordered_map<ProcessHandle, TaskID> dedicated_workers_to_tasks;
std::unordered_map<pid_t, TaskID> dedicated_workers_to_tasks;
/// A map for speeding up looking up the pending worker for the given task.
std::unordered_map<TaskID, ProcessHandle> tasks_to_dedicated_workers;
std::unordered_map<TaskID, pid_t> tasks_to_dedicated_workers;
/// We'll push a warning to the user every time a multiple of this many
/// worker processes has been started.
int multiple_for_warning;
@@ -222,8 +217,6 @@ class WorkerPool {
/// for a given language.
State &GetStateForLanguage(const Language &language);
/// Required by Boost.Process for managing subprocesses (e.g. reaping zombies).
boost::asio::io_service *io_service_;
/// The maximum number of worker processes that can be started concurrently.
int maximum_startup_concurrency_;
/// A client connection to the GCS.
+39 -61
View File
@@ -16,19 +16,17 @@ std::vector<Language> LANGUAGES = {Language::PYTHON, Language::JAVA};
class WorkerPoolMock : public WorkerPool {
public:
WorkerPoolMock(boost::asio::io_service &io_service)
WorkerPoolMock()
: WorkerPoolMock(
io_service,
{{Language::PYTHON,
{"dummy_py_worker_command", "--foo=RAY_WORKER_NUM_WORKERS_PLACEHOLDER"}},
{Language::JAVA,
{"dummy_java_worker_command",
"--foo=RAY_WORKER_NUM_WORKERS_PLACEHOLDER"}}}) {}
explicit WorkerPoolMock(boost::asio::io_service &io_service,
const WorkerCommandMap &worker_commands)
: WorkerPool(io_service, 0, MAXIMUM_STARTUP_CONCURRENCY, nullptr, worker_commands),
last_worker_process_() {
explicit WorkerPoolMock(const WorkerCommandMap &worker_commands)
: WorkerPool(0, MAXIMUM_STARTUP_CONCURRENCY, nullptr, worker_commands),
last_worker_pid_(0) {
for (auto &entry : states_by_lang_) {
entry.second.num_workers_per_process = NUM_WORKERS_PER_PROCESS;
}
@@ -39,30 +37,23 @@ class WorkerPoolMock : public WorkerPool {
states_by_lang_.clear();
}
using WorkerPool::StartWorkerProcess; // we need this to be public for testing
void StartWorkerProcess(const Language &language,
const std::vector<std::string> &dynamic_options = {}) {
WorkerPool::StartWorkerProcess(language, dynamic_options);
}
ProcessHandle StartProcess(
const std::vector<std::string> &worker_command_args) override {
#ifndef PID_MAX_LIMIT
// This is defined by Linux to be the maximum allowable number of processes
// There's no guarantee for other OSes, but it's good enough for testing...
enum { PID_MAX_LIMIT = 1 << 22 };
#endif
// Use a bogus process ID that won't conflict with those in the system
pid_t pid = static_cast<pid_t>(PID_MAX_LIMIT + 1 + worker_commands_by_proc_.size());
Process proc(pid);
proc.detach();
last_worker_process_ = std::make_shared<Process>(std::move(proc));
worker_commands_by_proc_[last_worker_process_] = worker_command_args;
return last_worker_process_;
pid_t StartProcess(const std::vector<std::string> &worker_command_args) override {
last_worker_pid_ += 1;
worker_commands_by_pid[last_worker_pid_] = worker_command_args;
return last_worker_pid_;
}
void WarnAboutSize() override {}
ProcessHandle LastStartedWorkerProcess() const { return last_worker_process_; }
pid_t LastStartedWorkerProcess() const { return last_worker_pid_; }
const std::vector<std::string> &GetWorkerCommand(ProcessHandle proc) {
return worker_commands_by_proc_[proc];
const std::vector<std::string> &GetWorkerCommand(int pid) {
return worker_commands_by_pid[pid];
}
int NumWorkersStarting() const {
@@ -84,19 +75,20 @@ class WorkerPoolMock : public WorkerPool {
}
private:
ProcessHandle last_worker_process_;
// The worker commands by process.
std::unordered_map<ProcessHandle, std::vector<std::string>> worker_commands_by_proc_;
int last_worker_pid_;
// The worker commands by pid.
std::unordered_map<int, std::vector<std::string>> worker_commands_by_pid;
};
class WorkerPoolTest : public ::testing::Test {
public:
WorkerPoolTest()
: worker_pool_(io_service_),
: worker_pool_(),
io_service_(),
error_message_type_(1),
client_call_manager_(io_service_) {}
std::shared_ptr<Worker> CreateWorker(ProcessHandle proc,
std::shared_ptr<Worker> CreateWorker(pid_t pid,
const Language &language = Language::PYTHON) {
std::function<void(LocalClientConnection &)> client_handler =
[this](LocalClientConnection &client) { HandleNewClient(client); };
@@ -109,22 +101,18 @@ class WorkerPoolTest : public ::testing::Test {
auto client =
LocalClientConnection::Create(client_handler, message_handler, std::move(socket),
"worker", {}, error_message_type_);
std::shared_ptr<Worker> worker = std::make_shared<Worker>(
WorkerID::FromRandom(), language, -1, client, client_call_manager_);
if (proc) {
worker->SetProcess(proc);
}
return worker;
return std::shared_ptr<Worker>(new Worker(WorkerID::FromRandom(), pid, language, -1,
client, client_call_manager_));
}
void SetWorkerCommands(const WorkerCommandMap &worker_commands) {
WorkerPoolMock worker_pool(io_service_, worker_commands);
WorkerPoolMock worker_pool(worker_commands);
this->worker_pool_ = std::move(worker_pool);
}
protected:
boost::asio::io_service io_service_;
WorkerPoolMock worker_pool_;
boost::asio::io_service io_service_;
int64_t error_message_type_;
rpc::ClientCallManager client_call_manager_;
@@ -154,21 +142,12 @@ static inline TaskSpecification ExampleTaskSpec(
return TaskSpecification(std::move(message));
}
TEST_F(WorkerPoolTest, CompareWorkerProcessObjects) {
typedef ProcessHandle T;
T a(std::make_shared<Process>()), b(std::make_shared<Process>()), empty = T();
ASSERT_TRUE(std::equal_to<T>()(a, a));
ASSERT_TRUE(!std::equal_to<T>()(a, b));
ASSERT_TRUE(!std::equal_to<T>()(b, a));
ASSERT_TRUE(!std::equal_to<T>()(empty, a));
ASSERT_TRUE(!std::equal_to<T>()(a, empty));
}
TEST_F(WorkerPoolTest, HandleWorkerRegistration) {
ProcessHandle proc = worker_pool_.StartWorkerProcess(Language::PYTHON);
worker_pool_.StartWorkerProcess(Language::PYTHON);
pid_t pid = worker_pool_.LastStartedWorkerProcess();
std::vector<std::shared_ptr<Worker>> workers;
for (int i = 0; i < NUM_WORKERS_PER_PROCESS; i++) {
workers.push_back(CreateWorker(ProcessHandle()));
workers.push_back(CreateWorker(pid));
}
for (const auto &worker : workers) {
// Check that there's still a starting worker process
@@ -176,7 +155,7 @@ TEST_F(WorkerPoolTest, HandleWorkerRegistration) {
ASSERT_EQ(worker_pool_.NumWorkerProcessesStarting(), 1);
// Check that we cannot lookup the worker before it's registered.
ASSERT_EQ(worker_pool_.GetRegisteredWorker(worker->Connection()), nullptr);
RAY_CHECK_OK(worker_pool_.RegisterWorker(worker, proc.get()->id()));
RAY_CHECK_OK(worker_pool_.RegisterWorker(worker));
// Check that we can lookup the worker after it's registered.
ASSERT_EQ(worker_pool_.GetRegisteredWorker(worker->Connection()), worker);
}
@@ -202,21 +181,20 @@ TEST_F(WorkerPoolTest, StartupWorkerProcessCount) {
ASSERT_TRUE(expected_worker_process_count <
static_cast<int>(desired_initial_worker_process_count_per_language *
LANGUAGES.size()));
ProcessHandle last_started_worker_process;
pid_t last_started_worker_process = 0;
for (int i = 0; i < desired_initial_worker_process_count_per_language; i++) {
for (size_t j = 0; j < LANGUAGES.size(); j++) {
worker_pool_.StartWorkerProcess(LANGUAGES[j]);
ASSERT_TRUE(worker_pool_.NumWorkerProcessesStarting() <=
expected_worker_process_count);
ProcessHandle prev = worker_pool_.LastStartedWorkerProcess();
if (last_started_worker_process.get() != prev.get()) {
last_started_worker_process = prev;
if (last_started_worker_process != worker_pool_.LastStartedWorkerProcess()) {
last_started_worker_process = worker_pool_.LastStartedWorkerProcess();
const auto &real_command =
worker_pool_.GetWorkerCommand(worker_pool_.LastStartedWorkerProcess());
ASSERT_EQ(real_command, worker_commands[j]);
} else {
ASSERT_EQ(worker_pool_.NumWorkerProcessesStarting(),
expected_worker_process_count);
ASSERT_TRUE(worker_pool_.NumWorkerProcessesStarting() ==
expected_worker_process_count);
ASSERT_TRUE(static_cast<int>(i * LANGUAGES.size() + j) >=
expected_worker_process_count);
}
@@ -246,8 +224,8 @@ TEST_F(WorkerPoolTest, HandleWorkerPushPop) {
// Create some workers.
std::unordered_set<std::shared_ptr<Worker>> workers;
workers.insert(CreateWorker(std::make_shared<Process>()));
workers.insert(CreateWorker(std::make_shared<Process>()));
workers.insert(CreateWorker(1234));
workers.insert(CreateWorker(5678));
// Add the workers to the pool.
for (auto &worker : workers) {
worker_pool_.PushWorker(worker);
@@ -266,7 +244,7 @@ TEST_F(WorkerPoolTest, HandleWorkerPushPop) {
TEST_F(WorkerPoolTest, PopActorWorker) {
// Create a worker.
auto worker = CreateWorker(std::make_shared<Process>());
auto worker = CreateWorker(1234);
// Add the worker to the pool.
worker_pool_.PushWorker(worker);
@@ -289,7 +267,7 @@ TEST_F(WorkerPoolTest, PopActorWorker) {
TEST_F(WorkerPoolTest, PopWorkersOfMultipleLanguages) {
// Create a Python Worker, and add it to the pool
auto py_worker = CreateWorker(std::make_shared<Process>(), Language::PYTHON);
auto py_worker = CreateWorker(1234, Language::PYTHON);
worker_pool_.PushWorker(py_worker);
// Check that no worker will be popped if the given task is a Java task
const auto java_task_spec = ExampleTaskSpec(ActorID::Nil(), Language::JAVA);
@@ -299,7 +277,7 @@ TEST_F(WorkerPoolTest, PopWorkersOfMultipleLanguages) {
ASSERT_NE(worker_pool_.PopWorker(py_task_spec), nullptr);
// Create a Java Worker, and add it to the pool
auto java_worker = CreateWorker(std::make_shared<Process>(), Language::JAVA);
auto java_worker = CreateWorker(1234, Language::JAVA);
worker_pool_.PushWorker(java_worker);
// Check that the worker will be popped now for Java task
ASSERT_NE(worker_pool_.PopWorker(java_task_spec), nullptr);
-123
View File
@@ -1,123 +0,0 @@
#ifndef RAY_UTIL_PROCESS_H
#define RAY_UTIL_PROCESS_H
#ifdef __linux__
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <algorithm>
#include <boost/asio/io_service.hpp>
#include <boost/process/args.hpp>
#include <boost/process/child.hpp>
#include <functional>
#include <memory>
#include <utility>
// We only define operators required by the standard library (==, hash).
// We declare but avoid defining the rest so that they're not used by accident.
namespace ray {
typedef boost::process::pid_t pid_t;
class Process : public boost::process::child {
protected:
class ProcessFD {
// This class makes a best-effort attempt to keep a PID alive.
// However, it cannot make any guarantees.
// The kernel might not even support this mechanism.
// See here: https://unix.stackexchange.com/a/181249
#ifdef __linux__
int fd_;
#endif
public:
#ifdef __linux__
~ProcessFD() {
if (fd_ != -1) {
::close(fd_);
}
}
ProcessFD(pid_t pid) : fd_(-1) {
if (pid != -1) {
char path[64];
sprintf(path, "/proc/%d/ns/pid", static_cast<int>(pid));
fd_ = ::open(path, O_RDONLY);
}
}
ProcessFD(ProcessFD &&other) : fd_(std::move(other.fd_)) { other.fd_ = -1; }
ProcessFD(const ProcessFD &other) : fd_(other.fd_ != -1 ? ::dup(other.fd_) : -1) {}
ProcessFD &operator=(ProcessFD other) {
using std::swap;
swap(fd_, other.fd_);
return *this;
}
#else
ProcessFD(pid_t) {}
#endif
};
ProcessFD fd_;
public:
template <typename... T>
explicit Process(T &&... args)
: boost::process::child(std::forward<T>(args)...),
fd_(boost::process::child::id()) {}
};
/// A managed equivalent of a pid_t (to manage the lifetime of each process).
/// TODO(mehrdadn): This hasn't been a great design, but we play along to
/// minimize the changes needed for Windows compatibility.
/// (We used to represent a worker process by just its pid_t, which carries
/// no ownership/lifetime information.)
/// Once this code is running properly, refactor the data structures to create
/// a better ownership structure between the worker processes and the workers.
class ProcessHandle {
std::shared_ptr<Process> proc_;
public:
ProcessHandle(const std::shared_ptr<Process> &proc = std::shared_ptr<Process>())
: proc_(proc) {}
Process *get() const { return proc_.get(); }
explicit operator bool() const { return !!proc_; }
static ProcessHandle FromPid(pid_t pid) {
Process temp(pid);
temp.detach();
return std::make_shared<Process>(std::move(temp));
}
};
} // namespace ray
// Define comparators for process handles:
// - Valid process objects must be distinguished by their IDs.
// - Invalid process objects must be distinguished by their addresses.
namespace std {
template <>
struct equal_to<ray::ProcessHandle> {
bool operator()(const ray::ProcessHandle &x, const ray::ProcessHandle &y) const {
const ray::Process *a = x.get(), *b = y.get();
// See explanation above
return a ? b ? a->valid()
? b->valid() ? equal_to<ray::pid_t>()(a->id(), b->id()) : false
: b->valid() ? false : equal_to<void const *>()(a, b)
: false
: !b;
}
};
template <>
struct hash<ray::ProcessHandle> {
size_t operator()(const ray::ProcessHandle &value) const {
const ray::Process *p = value.get();
// See explanation above
return p ? p->valid() ? hash<ray::pid_t>()(p->id()) : hash<void const *>()(p)
: size_t();
}
};
} // namespace std
#endif
@@ -1,14 +0,0 @@
From 28126b3432433c025606a84474c7afb5dec88daf Mon Sep 17 00:00:00 2001
From: Klemens David Morgenstern <klemens.morgenstern@gmx.net>
Date: Sun, 12 May 2019 17:02:25 +0700
Subject: [PATCH] osx fix
---
diff --git include/boost/process/detail/posix/terminate.hpp include/boost/process/detail/posix/terminate.hpp
index 84024a5..e1e5f33 100644
--- boost/process/detail/posix/terminate.hpp
+++ boost/process/detail/posix/terminate.hpp
@@ -30,1 +30,1 @@
- ::waitpid(p.pid, &status, 0); //just to clean it up
+ ::waitpid(p.pid, &status, WNOHANG); //just to clean it up
--