mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 22:53:20 +08:00
[tune] Change log handling for Tune (#3661)
Also provides a small retry mechanism for a transient error as reported by #3340. Closes #3653.
This commit is contained in:
@@ -2,6 +2,8 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
|
||||
from ray.tune.error import TuneError
|
||||
from ray.tune.tune import run_experiments
|
||||
from ray.tune.experiment import Experiment
|
||||
@@ -9,6 +11,20 @@ from ray.tune.registry import register_env, register_trainable
|
||||
from ray.tune.trainable import Trainable
|
||||
from ray.tune.suggest import grid_search, function, sample_from
|
||||
|
||||
|
||||
def _setup_logger():
|
||||
logger = logging.getLogger("ray.tune")
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(
|
||||
logging.Formatter(
|
||||
"%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
|
||||
))
|
||||
logger.addHandler(handler)
|
||||
logger.propagate = False
|
||||
|
||||
|
||||
_setup_logger()
|
||||
|
||||
__all__ = [
|
||||
"Trainable",
|
||||
"TuneError",
|
||||
|
||||
@@ -110,7 +110,7 @@ class _LogSyncer(object):
|
||||
self.sync_process = None
|
||||
self.local_ip = ray.services.get_node_ip_address()
|
||||
self.worker_ip = None
|
||||
logger.info("Created LogSyncer for {} -> {}".format(
|
||||
logger.debug("Created LogSyncer for {} -> {}".format(
|
||||
local_dir, remote_dir))
|
||||
|
||||
def set_worker_ip(self, worker_ip):
|
||||
@@ -125,7 +125,7 @@ class _LogSyncer(object):
|
||||
def sync_now(self, force=False):
|
||||
self.last_sync_time = time.time()
|
||||
if not self.worker_ip:
|
||||
logger.info("Worker ip unknown, skipping log sync for {}".format(
|
||||
logger.debug("Worker ip unknown, skipping log sync for {}".format(
|
||||
self.local_dir))
|
||||
return
|
||||
|
||||
@@ -178,7 +178,7 @@ class _LogSyncer(object):
|
||||
if final_cmd:
|
||||
final_cmd += " && "
|
||||
final_cmd += local_to_remote_sync_cmd
|
||||
logger.info("Running log sync: {}".format(final_cmd))
|
||||
logger.debug("Running log sync: {}".format(final_cmd))
|
||||
self.sync_process = subprocess.Popen(final_cmd, shell=True)
|
||||
|
||||
def wait(self):
|
||||
|
||||
@@ -234,10 +234,16 @@ class RayTrialExecutor(TrialExecutor):
|
||||
assert self._committed_resources.cpu >= 0
|
||||
assert self._committed_resources.gpu >= 0
|
||||
|
||||
def _update_avail_resources(self):
|
||||
resources = ray.global_state.cluster_resources()
|
||||
def _update_avail_resources(self, num_retries=5):
|
||||
for i in range(num_retries):
|
||||
resources = ray.global_state.cluster_resources()
|
||||
if not resources:
|
||||
logger.warning("Cluster resources not detected. Retrying...")
|
||||
time.sleep(0.5)
|
||||
|
||||
num_cpus = resources["CPU"]
|
||||
num_gpus = resources["GPU"]
|
||||
|
||||
self._avail_resources = Resources(int(num_cpus), int(num_gpus))
|
||||
self._resources_initialized = True
|
||||
|
||||
|
||||
@@ -355,9 +355,8 @@ class Trial(object):
|
||||
result.update(done=True)
|
||||
if self.verbose and (terminate or time.time() - self.last_debug >
|
||||
DEBUG_PRINT_INTERVAL):
|
||||
logger.info("Result for {}:".format(self))
|
||||
logger.info(" {}".format(
|
||||
pretty_print(result).replace("\n", "\n ")))
|
||||
print("Result for {}:".format(self))
|
||||
print(" {}".format(pretty_print(result).replace("\n", "\n ")))
|
||||
self.last_debug = time.time()
|
||||
self.last_result = result
|
||||
self.last_update_time = time.time()
|
||||
|
||||
@@ -163,15 +163,16 @@ def run_experiments(experiments,
|
||||
queue_trials=queue_trials,
|
||||
trial_executor=trial_executor)
|
||||
|
||||
logger.info(runner.debug_string(max_debug=99999))
|
||||
print(runner.debug_string(max_debug=99999))
|
||||
|
||||
last_debug = 0
|
||||
while not runner.is_finished():
|
||||
runner.step()
|
||||
if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
|
||||
logger.info(runner.debug_string())
|
||||
print(runner.debug_string())
|
||||
last_debug = time.time()
|
||||
|
||||
logger.info(runner.debug_string(max_debug=99999))
|
||||
print(runner.debug_string(max_debug=99999))
|
||||
|
||||
wait_for_log_sync()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user