mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 02:56:05 +08:00
## What do these changes do? This change saves a checkpoint if needed, regardless of the decision the scheduler returned. Until now, no checkpoint was saved when the scheduler returned TrialScheduler.PAUSE, which prevented PopulationBasedTraining from saving any checkpoints in certain cases. See issue #4041 for more details. ## Related issue number #4041
This commit is contained in:
committed by
Richard Liaw
parent
64c95aea85
commit
b2677fabc0
@@ -405,15 +405,17 @@ class TrialRunner(object):
|
||||
trial.update_last_result(
|
||||
result, terminate=(decision == TrialScheduler.STOP))
|
||||
|
||||
# Checkpoints to disk. This should be checked even if
|
||||
# the scheduler decision is STOP or PAUSE. Note that
|
||||
# PAUSE only checkpoints to memory and does not update
|
||||
# the global checkpoint state.
|
||||
self._checkpoint_trial_if_needed(trial)
|
||||
|
||||
if decision == TrialScheduler.CONTINUE:
|
||||
self._checkpoint_trial_if_needed(trial)
|
||||
self.trial_executor.continue_training(trial)
|
||||
elif decision == TrialScheduler.PAUSE:
|
||||
self.trial_executor.pause_trial(trial)
|
||||
elif decision == TrialScheduler.STOP:
|
||||
# Checkpoint before ending the trial
|
||||
# if checkpoint_at_end experiment option is set to True
|
||||
self._checkpoint_trial_if_needed(trial)
|
||||
self.trial_executor.export_trial_if_needed(trial)
|
||||
self.trial_executor.stop_trial(trial)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user