From 82f9c7014e2d0acd3e3869066f5dc3142ec9e7a7 Mon Sep 17 00:00:00 2001
From: Gekho457 <62982571+Gekho457@users.noreply.github.com>
Date: Thu, 17 Dec 2020 09:41:48 -0800
Subject: [PATCH] [K8s] Retry getting home directory in command runner. (#12925)

---
Reviewer note (this area is ignored by `git am`):
  * Adds module constants MAX_HOME_RETRIES (3) and HOME_RETRY_DELAY_S (5).
  * `KubernetesCommandRunner._home` now returns the cached value when set;
    otherwise it calls `_try_to_get_home()` up to MAX_HOME_RETRIES times,
    logging and sleeping HOME_RETRY_DELAY_S seconds after each failed
    attempt except the last, whose exception propagates to the caller.
  * The `kubectl exec ... printenv HOME` call itself moves, unchanged,
    into the new `_try_to_get_home()` helper.

 .../ray/autoscaler/_private/command_runner.py | 33 ++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py
index 075efa377..f350ff1f3 100644
--- a/python/ray/autoscaler/_private/command_runner.py
+++ b/python/ray/autoscaler/_private/command_runner.py
@@ -35,6 +35,8 @@ logger = logging.getLogger(__name__)
 HASH_MAX_LENGTH = 10
 KUBECTL_RSYNC = os.path.join(
     os.path.dirname(os.path.abspath(__file__)), "kubernetes/kubectl-rsync.sh")
+MAX_HOME_RETRIES = 3
+HOME_RETRY_DELAY_S = 5
 
 _config = {"use_login_shells": True, "silent_rsync": True}
 
@@ -248,16 +250,31 @@ class KubernetesCommandRunner(CommandRunnerInterface):
 
     @property
     def _home(self):
+        if self._home_cached is not None:
+            return self._home_cached
+        for _ in range(MAX_HOME_RETRIES - 1):
+            try:
+                self._home_cached = self._try_to_get_home()
+                return self._home_cached
+            except Exception:
+                # TODO (Dmitri): Identify the exception we're trying to avoid.
+                logger.info("Error reading container's home directory. "
+                            f"Retrying in {HOME_RETRY_DELAY_S} seconds.")
+                time.sleep(HOME_RETRY_DELAY_S)
+        # Last try
+        self._home_cached = self._try_to_get_home()
+        return self._home_cached
+
+    def _try_to_get_home(self):
         # TODO (Dmitri): Think about how to use the node's HOME variable
         # without making an extra kubectl exec call.
-        if self._home_cached is None:
-            cmd = self.kubectl + [
-                "exec", "-it", self.node_id, "--", "printenv", "HOME"
-            ]
-            joined_cmd = " ".join(cmd)
-            raw_out = self.process_runner.check_output(joined_cmd, shell=True)
-            self._home_cached = raw_out.decode().strip("\n\r")
-        return self._home_cached
+        cmd = self.kubectl + [
+            "exec", "-it", self.node_id, "--", "printenv", "HOME"
+        ]
+        joined_cmd = " ".join(cmd)
+        raw_out = self.process_runner.check_output(joined_cmd, shell=True)
+        home = raw_out.decode().strip("\n\r")
+        return home
 
 
 class SSHOptions: