diff --git a/python/ray/tune/experiment.py b/python/ray/tune/experiment.py
index 4471edd2b..31a1ce7a8 100644
--- a/python/ray/tune/experiment.py
+++ b/python/ray/tune/experiment.py
@@ -74,7 +74,8 @@ class Experiment(object):
             experiment regardless of the checkpoint_freq. Default is False.
         max_failures (int): Try to recover a trial from its last
             checkpoint at least this many times. Only applies if
-            checkpointing is enabled. Defaults to 3.
+            checkpointing is enabled. Setting to -1 will lead to infinite
+            recovery retries. Defaults to 3.
         restore (str): Path to checkpoint. Only makes sense to set if
             running 1 trial. Defaults to None.
         repeat: Deprecated and will be removed in future versions of
diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py
index 0dd571942..12693a6e3 100644
--- a/python/ray/tune/trial.py
+++ b/python/ray/tune/trial.py
@@ -356,7 +356,8 @@ class Trial(object):
         be a checkpoint.
         """
         return (self.checkpoint_freq > 0
-                and self.num_failures < self.max_failures)
+                and (self.num_failures < self.max_failures
+                     or self.max_failures < 0))
 
     def update_last_result(self, result, terminate=False):
         if terminate: