diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 4411c410b..07f34b11e 100755 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -55,14 +55,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --stop '{"training_iteration": 1}' \ --config '{"simple_optimizer": true, "num_sgd_iter": 2, "model": {"use_lstm": true}}' -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/ci/suppress_output --force-direct /ray/rllib/train.py \ - --env CartPole-v1 \ - --run PPO \ - --stop '{"training_iteration": 1}' \ - --config '{"num_gpus": 0.1}' \ - --ray-num-gpus 1 - docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output --force-direct /ray/rllib/train.py \ --env CartPole-v1 \ diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py index 4ad318515..756059395 100644 --- a/python/ray/tune/ray_trial_executor.py +++ b/python/ray/tune/ray_trial_executor.py @@ -118,7 +118,7 @@ class RayTrialExecutor(TrialExecutor): # Clear the Trial's location (to be updated later on result) # since we don't know where the remote runner is placed. trial.set_location(Location()) - logger.info("Trial %s: Setting up new remote runner.", trial) + logger.debug("Trial %s: Setting up new remote runner.", trial) # Logging for trials is handled centrally by TrialRunner, so # configure the remote runner to use a noop-logger. return cls.remote(config=trial.config, logger_creator=logger_creator) diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index ac9edd344..10beb3c50 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -327,14 +327,20 @@ class RolloutWorker(EvaluatorInterface): logger.info("Could not seed torch") if _has_tensorflow_graph(policy_dict) and not (tf and tf.executing_eagerly()): - if (ray.is_initialized() - and ray.worker._mode() != ray.worker.LOCAL_MODE - and not ray.get_gpu_ids()): - logger.debug("Creating policy evaluation worker {}".format( - worker_index) + - " on CPU (please ignore any CUDA init errors)") if not tf: raise ImportError("Could not import tensorflow") + if (ray.is_initialized() + and ray.worker._mode() != ray.worker.LOCAL_MODE): + if not ray.get_gpu_ids(): + logger.debug( + "Creating policy evaluation worker {}".format( + worker_index) + + " on CPU (please ignore any CUDA init errors)") + elif not tf.test.is_gpu_available(): + raise RuntimeError( + "GPUs were assigned to this worker by Ray, but " + "TensorFlow reports GPU acceleration is disabled. " + "This could be due to a bad CUDA or TF installation.") with tf.Graph().as_default(): if tf_session_creator: self.tf_sess = tf_session_creator()