[tune] Change log handling for Tune (#3661)

Also provides a small retry mechanism for a transient error as reported
by #3340.

Closes #3653.
This commit is contained in:
Richard Liaw
2019-01-06 13:20:10 -08:00
committed by GitHub
parent 681e8cd3fd
commit 8934e37a78
5 changed files with 33 additions and 11 deletions
+16
View File
@@ -2,6 +2,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
from ray.tune.error import TuneError
from ray.tune.tune import run_experiments
from ray.tune.experiment import Experiment
@@ -9,6 +11,20 @@ from ray.tune.registry import register_env, register_trainable
from ray.tune.trainable import Trainable
from ray.tune.suggest import grid_search, function, sample_from
def _setup_logger():
logger = logging.getLogger("ray.tune")
handler = logging.StreamHandler()
handler.setFormatter(
logging.Formatter(
"%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
))
logger.addHandler(handler)
logger.propagate = False
_setup_logger()
__all__ = [
"Trainable",
"TuneError",
+3 -3
View File
@@ -110,7 +110,7 @@ class _LogSyncer(object):
self.sync_process = None
self.local_ip = ray.services.get_node_ip_address()
self.worker_ip = None
logger.info("Created LogSyncer for {} -> {}".format(
logger.debug("Created LogSyncer for {} -> {}".format(
local_dir, remote_dir))
def set_worker_ip(self, worker_ip):
@@ -125,7 +125,7 @@ class _LogSyncer(object):
def sync_now(self, force=False):
self.last_sync_time = time.time()
if not self.worker_ip:
logger.info("Worker ip unknown, skipping log sync for {}".format(
logger.debug("Worker ip unknown, skipping log sync for {}".format(
self.local_dir))
return
@@ -178,7 +178,7 @@ class _LogSyncer(object):
if final_cmd:
final_cmd += " && "
final_cmd += local_to_remote_sync_cmd
logger.info("Running log sync: {}".format(final_cmd))
logger.debug("Running log sync: {}".format(final_cmd))
self.sync_process = subprocess.Popen(final_cmd, shell=True)
def wait(self):
+8 -2
View File
@@ -234,10 +234,16 @@ class RayTrialExecutor(TrialExecutor):
assert self._committed_resources.cpu >= 0
assert self._committed_resources.gpu >= 0
def _update_avail_resources(self):
resources = ray.global_state.cluster_resources()
def _update_avail_resources(self, num_retries=5):
for i in range(num_retries):
resources = ray.global_state.cluster_resources()
if not resources:
logger.warning("Cluster resources not detected. Retrying...")
time.sleep(0.5)
num_cpus = resources["CPU"]
num_gpus = resources["GPU"]
self._avail_resources = Resources(int(num_cpus), int(num_gpus))
self._resources_initialized = True
+2 -3
View File
@@ -355,9 +355,8 @@ class Trial(object):
result.update(done=True)
if self.verbose and (terminate or time.time() - self.last_debug >
DEBUG_PRINT_INTERVAL):
logger.info("Result for {}:".format(self))
logger.info(" {}".format(
pretty_print(result).replace("\n", "\n ")))
print("Result for {}:".format(self))
print(" {}".format(pretty_print(result).replace("\n", "\n ")))
self.last_debug = time.time()
self.last_result = result
self.last_update_time = time.time()
+4 -3
View File
@@ -163,15 +163,16 @@ def run_experiments(experiments,
queue_trials=queue_trials,
trial_executor=trial_executor)
logger.info(runner.debug_string(max_debug=99999))
print(runner.debug_string(max_debug=99999))
last_debug = 0
while not runner.is_finished():
runner.step()
if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
logger.info(runner.debug_string())
print(runner.debug_string())
last_debug = time.time()
logger.info(runner.debug_string(max_debug=99999))
print(runner.debug_string(max_debug=99999))
wait_for_log_sync()