diff --git a/python/ray/tune/examples/async_hyperband_example.py b/python/ray/tune/examples/async_hyperband_example.py
index fcd4fda4b..a5a5bb4e0 100644
--- a/python/ray/tune/examples/async_hyperband_example.py
+++ b/python/ray/tune/examples/async_hyperband_example.py
@@ -50,8 +50,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--smoke-test", action="store_true", help="Finish quickly for testing")
+    parser.add_argument(
+        "--ray-redis-address",
+        help="Address of Ray cluster for seamless distributed execution.")
     args, _ = parser.parse_known_args()
-    ray.init()
+    ray.init(redis_address=args.ray_redis_address)
 
     # asynchronous hyperband early stopping, configured with
     # `episode_reward_mean` as the
diff --git a/python/ray/tune/log_sync.py b/python/ray/tune/log_sync.py
index 6888ae6a8..29c39d113 100644
--- a/python/ray/tune/log_sync.py
+++ b/python/ray/tune/log_sync.py
@@ -55,7 +55,7 @@ class NodeSyncMixin():
 
     def _check_valid_worker_ip(self):
         if not self.worker_ip:
-            logger.info("Worker ip unknown, skipping log sync for {}".format(
+            logger.debug("Worker ip unknown, skipping log sync for {}".format(
                 self._local_dir))
             return False
         if self.worker_ip == self.local_ip:
diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py
index 4a17bc581..f2ac6a68f 100644
--- a/python/ray/tune/ray_trial_executor.py
+++ b/python/ray/tune/ray_trial_executor.py
@@ -180,11 +180,8 @@ class RayTrialExecutor(TrialExecutor):
                     logger.debug("Reusing actor for {}".format(trial.runner))
                     self._cached_actor = trial.runner
                 else:
-                    logger.info(
-                        "Destroying actor for trial {}. If your trainable is "
-                        "slow to initialize, consider setting "
-                        "reuse_actors=True to reduce actor creation "
-                        "overheads.".format(trial))
+                    logger.debug(
+                        "Destroying actor for trial {}.".format(trial))
                     trial.runner.stop.remote()
                     trial.runner.__ray_terminate__.remote()
         except Exception:
diff --git a/python/ray/tune/trainable.py b/python/ray/tune/trainable.py
index f5df41095..04ebbf527 100644
--- a/python/ray/tune/trainable.py
+++ b/python/ray/tune/trainable.py
@@ -26,6 +26,8 @@ from ray.tune.util import UtilMonitor
 
 logger = logging.getLogger(__name__)
 
+SETUP_TIME_THRESHOLD = 10
+
 
 class Trainable(object):
     """Abstract class for trainable models, functions, etc.
@@ -93,7 +95,14 @@ class Trainable(object):
         self._timesteps_since_restore = 0
         self._iterations_since_restore = 0
         self._restored = False
+        start_time = time.time()
         self._setup(copy.deepcopy(self.config))
+        setup_time = time.time() - start_time
+        if setup_time > SETUP_TIME_THRESHOLD:
+            logger.info("_setup took {:.3f} seconds. If your trainable is "
+                        "slow to initialize, consider setting "
+                        "reuse_actors=True to reduce actor creation "
+                        "overheads.".format(setup_time))
         self._local_ip = ray.services.get_node_ip_address()
 
         self._monitor = UtilMonitor(start=log_sys_usage)
diff --git a/python/ray/tune/util.py b/python/ray/tune/util.py
index 0c53a14ad..a6c122428 100644
--- a/python/ray/tune/util.py
+++ b/python/ray/tune/util.py
@@ -43,10 +43,10 @@ class UtilMonitor(Thread):
 
     def __init__(self, start=True, delay=0.7):
         self.stopped = True
-        if GPUtil is None:
+        if GPUtil is None and start:
             logger.warning("Install gputil for GPU system monitoring.")
 
-        if psutil is None:
+        if psutil is None and start:
             logger.warning("Install psutil to monitor system performance.")
 
         if GPUtil is None and psutil is None: