From b2677fabc00ac651bc4a1ecf4c4fe8bf6c3b6bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Csord=C3=A1s=20R=C3=B3bert?= Date: Wed, 20 Feb 2019 20:54:28 +0100 Subject: [PATCH] [tune] Fix not saving a checkpoint in certain cases (issue #4041) (#4053) ## What do these changes do? Saves a checkpoint if needed, regardless of the decision the scheduler returned. Until now, no checkpoint was saved when the scheduler returned TrialScheduler.PAUSE, which prevented PopulationBasedTraining from saving any checkpoints in certain cases. See issue #4041 for more details. ## Related issue number #4041 --- python/ray/tune/trial_runner.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index 93eaf1068..3149cab13 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -405,15 +405,17 @@ class TrialRunner(object): trial.update_last_result( result, terminate=(decision == TrialScheduler.STOP)) + # Checkpoints to disk. This should be checked even if + # the scheduler decision is STOP or PAUSE. Note that + # PAUSE only checkpoints to memory and does not update + # the global checkpoint state. + self._checkpoint_trial_if_needed(trial) + if decision == TrialScheduler.CONTINUE: - self._checkpoint_trial_if_needed(trial) self.trial_executor.continue_training(trial) elif decision == TrialScheduler.PAUSE: self.trial_executor.pause_trial(trial) elif decision == TrialScheduler.STOP: - # Checkpoint before ending the trial - # if checkpoint_at_end experiment option is set to True - self._checkpoint_trial_if_needed(trial) self.trial_executor.export_trial_if_needed(trial) self.trial_executor.stop_trial(trial) else: