From edd9916e3076e2b99ef009a861ab2347e91c1cf9 Mon Sep 17 00:00:00 2001 From: Kai Yang Date: Fri, 11 Sep 2020 17:33:09 +0800 Subject: [PATCH] Fix Java CI crash caused by incorrect destruction order in core worker (#10709) --- java/test.sh | 4 ++++ src/ray/core_worker/core_worker.cc | 11 +++++------ src/ray/core_worker/core_worker.h | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/java/test.sh b/java/test.sh index 36a92f259..424adc82b 100755 --- a/java/test.sh +++ b/java/test.sh @@ -16,6 +16,10 @@ run_testng() { fi # exit_code == 2 means there are skipped tests. if [ $exit_code -ne 2 ] && [ $exit_code -ne 0 ] ; then + if [ $exit_code -gt 128 ] ; then + # Test crashed. Print the driver log for diagnosis. + cat /tmp/ray/session_latest/logs/java-core-driver-* + fi find . -name "hs_err_*log" -exec cat {} + exit $exit_code fi diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index d62832536..a9ca05baa 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -532,12 +532,8 @@ void CoreWorker::Shutdown() { } void CoreWorker::Disconnect() { - io_service_.stop(); if (connected_) { connected_ = false; - if (gcs_client_) { - gcs_client_->Disconnect(); - } if (local_raylet_client_) { RAY_IGNORE_EXPR(local_raylet_client_->Disconnect()); } @@ -630,6 +626,9 @@ void CoreWorker::WaitForShutdown() { if (io_thread_.joinable()) { io_thread_.join(); } + if (gcs_client_) { + gcs_client_->Disconnect(); + } if (options_.worker_type == WorkerType::WORKER) { RAY_CHECK(task_execution_service_.stopped()); // Asyncio coroutines could still run after CoreWorker is removed because it is @@ -2093,7 +2092,7 @@ void CoreWorker::HandleCancelTask(const rpc::CancelTaskRequest &request, // Do force kill after reply callback sent if (success && request.force_kill()) { RAY_LOG(INFO) << "Force killing a worker running " << main_thread_task_id_; - RAY_IGNORE_EXPR(local_raylet_client_->Disconnect()); + Disconnect(); if (options_.enable_logging) { RayLog::ShutDownRayLog(); } @@ -2122,7 +2121,7 @@ void CoreWorker::HandleKillActor(const rpc::KillActorRequest &request, if (request.force_kill()) { RAY_LOG(INFO) << "Got KillActor, exiting immediately..."; if (request.no_restart()) { - RAY_IGNORE_EXPR(local_raylet_client_->Disconnect()); + Disconnect(); } if (options_.num_workers > 1) { // TODO (kfstorm): Should we add some kind of check before sending the killing diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 3fa81bedb..900c4c482 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -319,7 +319,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Public methods used by `CoreWorkerProcess` and `CoreWorker` itself. /// - /// Gracefully disconnect the worker from other components of ray. e.g. Raylet. + /// Gracefully disconnect the worker from Raylet. /// If this function is called during shutdown, Raylet will treat it as an intentional /// disconnect. ///