[rllib] Raise an error if GPUs are enabled but tf.test.is_gpu_available() returns False (#6365)

This commit is contained in:
Eric Liang
2019-12-05 10:13:54 -08:00
committed by GitHub
parent 668ce47360
commit 4c6739476b
3 changed files with 13 additions and 15 deletions
-8
View File
@@ -55,14 +55,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
--stop '{"training_iteration": 1}' \
--config '{"simple_optimizer": true, "num_sgd_iter": 2, "model": {"use_lstm": true}}'
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output --force-direct /ray/rllib/train.py \
--env CartPole-v1 \
--run PPO \
--stop '{"training_iteration": 1}' \
--config '{"num_gpus": 0.1}' \
--ray-num-gpus 1
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output --force-direct /ray/rllib/train.py \
--env CartPole-v1 \
+1 -1
View File
@@ -118,7 +118,7 @@ class RayTrialExecutor(TrialExecutor):
# Clear the Trial's location (to be updated later on result)
# since we don't know where the remote runner is placed.
trial.set_location(Location())
logger.info("Trial %s: Setting up new remote runner.", trial)
logger.debug("Trial %s: Setting up new remote runner.", trial)
# Logging for trials is handled centrally by TrialRunner, so
# configure the remote runner to use a noop-logger.
return cls.remote(config=trial.config, logger_creator=logger_creator)
+12 -6
View File
@@ -327,14 +327,20 @@ class RolloutWorker(EvaluatorInterface):
logger.info("Could not seed torch")
if _has_tensorflow_graph(policy_dict) and not (tf and
tf.executing_eagerly()):
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE
and not ray.get_gpu_ids()):
logger.debug("Creating policy evaluation worker {}".format(
worker_index) +
" on CPU (please ignore any CUDA init errors)")
if not tf:
raise ImportError("Could not import tensorflow")
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE):
if not ray.get_gpu_ids():
logger.debug(
"Creating policy evaluation worker {}".format(
worker_index) +
" on CPU (please ignore any CUDA init errors)")
elif not tf.test.is_gpu_available():
raise RuntimeError(
"GPUs were assigned to this worker by Ray, but "
"TensorFlow reports GPU acceleration is disabled. "
"This could be due to a bad CUDA or TF installation.")
with tf.Graph().as_default():
if tf_session_creator:
self.tf_sess = tf_session_creator()