diff --git a/python/ray/tune/__init__.py b/python/ray/tune/__init__.py index 1e341b265..17c5fa2db 100644 --- a/python/ray/tune/__init__.py +++ b/python/ray/tune/__init__.py @@ -2,6 +2,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import logging + from ray.tune.error import TuneError from ray.tune.tune import run_experiments from ray.tune.experiment import Experiment @@ -9,6 +11,20 @@ from ray.tune.registry import register_env, register_trainable from ray.tune.trainable import Trainable from ray.tune.suggest import grid_search, function, sample_from + +def _setup_logger(): + logger = logging.getLogger("ray.tune") + handler = logging.StreamHandler() + handler.setFormatter( + logging.Formatter( + "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s" + )) + logger.addHandler(handler) + logger.propagate = False + + +_setup_logger() + __all__ = [ "Trainable", "TuneError", diff --git a/python/ray/tune/log_sync.py b/python/ray/tune/log_sync.py index 2046165c0..a63d7b1b7 100644 --- a/python/ray/tune/log_sync.py +++ b/python/ray/tune/log_sync.py @@ -110,7 +110,7 @@ class _LogSyncer(object): self.sync_process = None self.local_ip = ray.services.get_node_ip_address() self.worker_ip = None - logger.info("Created LogSyncer for {} -> {}".format( + logger.debug("Created LogSyncer for {} -> {}".format( local_dir, remote_dir)) def set_worker_ip(self, worker_ip): @@ -125,7 +125,7 @@ class _LogSyncer(object): def sync_now(self, force=False): self.last_sync_time = time.time() if not self.worker_ip: - logger.info("Worker ip unknown, skipping log sync for {}".format( + logger.debug("Worker ip unknown, skipping log sync for {}".format( self.local_dir)) return @@ -178,7 +178,7 @@ class _LogSyncer(object): if final_cmd: final_cmd += " && " final_cmd += local_to_remote_sync_cmd - logger.info("Running log sync: {}".format(final_cmd)) + logger.debug("Running log sync: {}".format(final_cmd)) self.sync_process = subprocess.Popen(final_cmd, shell=True) def wait(self): diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py index 384f18802..a2e76f60e 100644 --- a/python/ray/tune/ray_trial_executor.py +++ b/python/ray/tune/ray_trial_executor.py @@ -234,10 +234,16 @@ class RayTrialExecutor(TrialExecutor): assert self._committed_resources.cpu >= 0 assert self._committed_resources.gpu >= 0 - def _update_avail_resources(self): - resources = ray.global_state.cluster_resources() + def _update_avail_resources(self, num_retries=5): + for i in range(num_retries): + resources = ray.global_state.cluster_resources() + if not resources: + logger.warning("Cluster resources not detected. Retrying...") + time.sleep(0.5) + num_cpus = resources["CPU"] num_gpus = resources["GPU"] + self._avail_resources = Resources(int(num_cpus), int(num_gpus)) self._resources_initialized = True diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 4ebfee187..7228c9fbd 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -355,9 +355,8 @@ class Trial(object): result.update(done=True) if self.verbose and (terminate or time.time() - self.last_debug > DEBUG_PRINT_INTERVAL): - logger.info("Result for {}:".format(self)) - logger.info(" {}".format( - pretty_print(result).replace("\n", "\n "))) + print("Result for {}:".format(self)) + print(" {}".format(pretty_print(result).replace("\n", "\n "))) self.last_debug = time.time() self.last_result = result self.last_update_time = time.time() diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index 1714f4c4f..f93e3490b 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -163,15 +163,16 @@ def run_experiments(experiments, queue_trials=queue_trials, trial_executor=trial_executor) - logger.info(runner.debug_string(max_debug=99999)) + print(runner.debug_string(max_debug=99999)) + last_debug = 0 while not runner.is_finished(): runner.step() if time.time() - last_debug > DEBUG_PRINT_INTERVAL: - logger.info(runner.debug_string()) + print(runner.debug_string()) last_debug = time.time() - logger.info(runner.debug_string(max_debug=99999)) + print(runner.debug_string(max_debug=99999)) wait_for_log_sync()