mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 03:27:50 +08:00
[tune] Cluster Fault Tolerance (#3309)
This PR introduces cluster-level fault tolerance for Tune by checkpointing global state. Checkpointing occurs at relatively high frequency, which allows users to easily resume experiments after the cluster crashes. Note that this PR may affect automated workflows because of auto-prompting, but this is resolvable.
This commit is contained in:
@@ -51,7 +51,9 @@ class Cluster(object):
         assert not self.connected
         redis_password = head_node_args.get("redis_password")
         output_info = ray.init(
-            redis_address=self.redis_address, redis_password=redis_password)
+            ignore_reinit_error=True,
+            redis_address=self.redis_address,
+            redis_password=redis_password)
         logger.info(output_info)
         self.connected = True
|
||||
|
||||
|
||||
Reference in New Issue
Block a user