[tune] Checkpoint and Sync at end (#5699)

This commit is contained in:
Richard Liaw
2019-09-15 15:58:58 -07:00
committed by GitHub
parent baac370099
commit 2b2eb4debb
3 changed files with 8 additions and 5 deletions
+1 -1
View File
@@ -224,7 +224,7 @@ class TFLogger(Logger):
"""
def _init(self):
-logger.info("Initializing TFLogger instead of TF2Logger.")
+logger.debug("Initializing TFLogger instead of TF2Logger.")
self._file_writer = tf.compat.v1.summary.FileWriter(self.logdir)
def on_result(self, result):
+3 -3
View File
@@ -1153,6 +1153,8 @@ class TestSyncFunctionality(unittest.TestCase):
os.remove(test_file_path)
def testNoSync(self):
"""Sync should not run on a single node."""
def sync_func(source, target):
pass
@@ -1165,9 +1167,7 @@ class TestSyncFunctionality(unittest.TestCase):
"stop": {
"training_iteration": 1
},
-"upload_dir": "test",
-"sync_to_driver": sync_func,
-"sync_to_cloud": sync_func
+"sync_to_driver": sync_func
}).trials
self.assertEqual(mock_sync.call_count, 0)
+4 -1
View File
@@ -271,7 +271,10 @@ class TrialRunner(object):
json.dump(runner_state, f, indent=2, cls=_TuneFunctionEncoder)
os.rename(tmp_file_name, self.checkpoint_file)
-self._syncer.sync_up_if_needed()
+if force:
+self._syncer.sync_up()
+else:
+self._syncer.sync_up_if_needed()
return self._local_checkpoint_dir
def resume(self):