mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 14:48:54 +08:00
Fix Java CI crash caused by incorrect destruction order in core worker (#10709)
This commit is contained in:
@@ -16,6 +16,10 @@ run_testng() {
|
||||
fi
|
||||
# exit_code == 2 means there are skipped tests.
|
||||
if [ $exit_code -ne 2 ] && [ $exit_code -ne 0 ] ; then
|
||||
if [ $exit_code -gt 128 ] ; then
|
||||
# Test crashed. Print the driver log for diagnosis.
|
||||
cat /tmp/ray/session_latest/logs/java-core-driver-*
|
||||
fi
|
||||
find . -name "hs_err_*log" -exec cat {} +
|
||||
exit $exit_code
|
||||
fi
|
||||
|
||||
@@ -532,12 +532,8 @@ void CoreWorker::Shutdown() {
|
||||
}
|
||||
|
||||
void CoreWorker::Disconnect() {
|
||||
io_service_.stop();
|
||||
if (connected_) {
|
||||
connected_ = false;
|
||||
if (gcs_client_) {
|
||||
gcs_client_->Disconnect();
|
||||
}
|
||||
if (local_raylet_client_) {
|
||||
RAY_IGNORE_EXPR(local_raylet_client_->Disconnect());
|
||||
}
|
||||
@@ -630,6 +626,9 @@ void CoreWorker::WaitForShutdown() {
|
||||
if (io_thread_.joinable()) {
|
||||
io_thread_.join();
|
||||
}
|
||||
if (gcs_client_) {
|
||||
gcs_client_->Disconnect();
|
||||
}
|
||||
if (options_.worker_type == WorkerType::WORKER) {
|
||||
RAY_CHECK(task_execution_service_.stopped());
|
||||
// Asyncio coroutines could still run after CoreWorker is removed because it is
|
||||
@@ -2093,7 +2092,7 @@ void CoreWorker::HandleCancelTask(const rpc::CancelTaskRequest &request,
|
||||
// Do force kill after reply callback sent
|
||||
if (success && request.force_kill()) {
|
||||
RAY_LOG(INFO) << "Force killing a worker running " << main_thread_task_id_;
|
||||
RAY_IGNORE_EXPR(local_raylet_client_->Disconnect());
|
||||
Disconnect();
|
||||
if (options_.enable_logging) {
|
||||
RayLog::ShutDownRayLog();
|
||||
}
|
||||
@@ -2122,7 +2121,7 @@ void CoreWorker::HandleKillActor(const rpc::KillActorRequest &request,
|
||||
if (request.force_kill()) {
|
||||
RAY_LOG(INFO) << "Got KillActor, exiting immediately...";
|
||||
if (request.no_restart()) {
|
||||
RAY_IGNORE_EXPR(local_raylet_client_->Disconnect());
|
||||
Disconnect();
|
||||
}
|
||||
if (options_.num_workers > 1) {
|
||||
// TODO (kfstorm): Should we add some kind of check before sending the killing
|
||||
|
||||
@@ -319,7 +319,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler {
|
||||
/// Public methods used by `CoreWorkerProcess` and `CoreWorker` itself.
|
||||
///
|
||||
|
||||
/// Gracefully disconnect the worker from other components of ray. e.g. Raylet.
|
||||
/// Gracefully disconnect the worker from Raylet.
|
||||
/// If this function is called during shutdown, Raylet will treat it as an intentional
|
||||
/// disconnect.
|
||||
///
|
||||
|
||||
Reference in New Issue
Block a user