From d29fcfb45c757a2c3bd8aafd9a470fa946dadf07 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Tue, 2 Feb 2021 14:52:09 +0100 Subject: [PATCH] [tune] catch SIGINT signal and trigger experiment checkpoint (#13767) * [tune] catch SIGINT signal and trigger experiment checkpoint * Apply suggestions from code review * Fix user guide docs * Update doc/source/tune/user-guide.rst --- doc/source/tune/user-guide.rst | 59 ++++++++++++++++++++ python/ray/tune/tests/test_tune_restore.py | 62 ++++++++++++++++++++++ python/ray/tune/tune.py | 33 ++++++++++-- 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/doc/source/tune/user-guide.rst b/doc/source/tune/user-guide.rst index a830791d0..909ebbc9f 100644 --- a/doc/source/tune/user-guide.rst +++ b/doc/source/tune/user-guide.rst @@ -261,6 +261,7 @@ You can restore a single trial checkpoint by using ``tune.run(restore= ExperimentAnalysis: """Executes training. + When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run + will gracefully shut down and checkpoint the latest experiment state. + Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step. + Examples: .. code-block:: python @@ -265,7 +271,6 @@ def run( `LoggerCallback` and `SyncerCallback` callbacks are automatically added. - Returns: ExperimentAnalysis: Object for experiment analysis. @@ -427,8 +432,24 @@ def run( "`Trainable.default_resource_request` if using the " "Trainable API.") + original_handler = signal.getsignal(signal.SIGINT) + state = {signal.SIGINT: False} + + def sigint_handler(sig, frame): + logger.warning( + "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. " + "This will try to checkpoint the experiment state one last time. " + "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) " + "to skip. ") + state[signal.SIGINT] = True + # Restore original signal handler to react to future SIGINT signals + signal.signal(signal.SIGINT, original_handler) + + if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")): + signal.signal(signal.SIGINT, sigint_handler) + tune_start = time.time() - while not runner.is_finished(): + while not runner.is_finished() and not state[signal.SIGINT]: runner.step() if has_verbosity(Verbosity.V1_EXPERIMENT): _report_progress(runner, progress_reporter) @@ -451,7 +472,7 @@ def run( incomplete_trials += [trial] if incomplete_trials: - if raise_on_failed_trial: + if raise_on_failed_trial and not state[signal.SIGINT]: raise TuneError("Trials did not complete", incomplete_trials) else: logger.error("Trials did not complete: %s", incomplete_trials) @@ -461,6 +482,12 @@ def run( logger.info(f"Total run time: {all_taken:.2f} seconds " f"({tune_taken:.2f} seconds for the tuning loop).") + if state[signal.SIGINT]: + logger.warning( + "Experiment has been interrupted, but the most recent state was " + "saved. You can continue running this experiment by passing " + "`resume=True` to `tune.run()`") + trials = runner.get_trials() return ExperimentAnalysis( runner.checkpoint_file,