Fix Java CI crash caused by incorrect destruction order in core worker (#10709)

This commit is contained in:
Kai Yang
2020-09-11 17:33:09 +08:00
committed by Barak Michener
parent df77a31242
commit edd9916e30
3 changed files with 10 additions and 7 deletions
+4
View File
@@ -16,6 +16,10 @@ run_testng() {
fi
# exit_code == 2 means there are skipped tests.
if [ $exit_code -ne 2 ] && [ $exit_code -ne 0 ] ; then
if [ $exit_code -gt 128 ] ; then
# Test crashed. Print the driver log for diagnosis.
cat /tmp/ray/session_latest/logs/java-core-driver-*
fi
find . -name "hs_err_*log" -exec cat {} +
exit $exit_code
fi
+5 -6
View File
@@ -532,12 +532,8 @@ void CoreWorker::Shutdown() {
}
void CoreWorker::Disconnect() {
io_service_.stop();
if (connected_) {
connected_ = false;
if (gcs_client_) {
gcs_client_->Disconnect();
}
if (local_raylet_client_) {
RAY_IGNORE_EXPR(local_raylet_client_->Disconnect());
}
@@ -630,6 +626,9 @@ void CoreWorker::WaitForShutdown() {
if (io_thread_.joinable()) {
io_thread_.join();
}
if (gcs_client_) {
gcs_client_->Disconnect();
}
if (options_.worker_type == WorkerType::WORKER) {
RAY_CHECK(task_execution_service_.stopped());
// Asyncio coroutines could still run after CoreWorker is removed because it is
@@ -2093,7 +2092,7 @@ void CoreWorker::HandleCancelTask(const rpc::CancelTaskRequest &request,
// Do force kill after reply callback sent
if (success && request.force_kill()) {
RAY_LOG(INFO) << "Force killing a worker running " << main_thread_task_id_;
RAY_IGNORE_EXPR(local_raylet_client_->Disconnect());
Disconnect();
if (options_.enable_logging) {
RayLog::ShutDownRayLog();
}
@@ -2122,7 +2121,7 @@ void CoreWorker::HandleKillActor(const rpc::KillActorRequest &request,
if (request.force_kill()) {
RAY_LOG(INFO) << "Got KillActor, exiting immediately...";
if (request.no_restart()) {
RAY_IGNORE_EXPR(local_raylet_client_->Disconnect());
Disconnect();
}
if (options_.num_workers > 1) {
// TODO (kfstorm): Should we add some kind of check before sending the killing
+1 -1
View File
@@ -319,7 +319,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler {
/// Public methods used by `CoreWorkerProcess` and `CoreWorker` itself.
///
/// Gracefully disconnect the worker from other components of ray. e.g. Raylet.
/// Gracefully disconnect the worker from Raylet.
/// If this function is called during shutdown, Raylet will treat it as an intentional
/// disconnect.
///