[rllib] Raise an error if GPUs are assigned to a worker but tf.test.is_gpu_available() returns False (#6365)

2026-07-03 15:44:37 +08:00 · 2019-12-05 10:13:54 -08:00
parent 668ce47360
commit 4c6739476b
3 changed files with 13 additions and 15 deletions
@@ -55,14 +55,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    --stop '{"training_iteration": 1}' \
    --config '{"simple_optimizer": true, "num_sgd_iter": 2, "model": {"use_lstm": true}}'

-docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output --force-direct /ray/rllib/train.py \
-    --env CartPole-v1 \
-    --run PPO \
-    --stop '{"training_iteration": 1}' \
-    --config '{"num_gpus": 0.1}' \
-    --ray-num-gpus 1
-
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output --force-direct /ray/rllib/train.py \
    --env CartPole-v1 \
@@ -118,7 +118,7 @@ class RayTrialExecutor(TrialExecutor):
        # Clear the Trial's location (to be updated later on result)
        # since we don't know where the remote runner is placed.
        trial.set_location(Location())
-        logger.info("Trial %s: Setting up new remote runner.", trial)
+        logger.debug("Trial %s: Setting up new remote runner.", trial)
        # Logging for trials is handled centrally by TrialRunner, so
        # configure the remote runner to use a noop-logger.
        return cls.remote(config=trial.config, logger_creator=logger_creator)
@@ -327,14 +327,20 @@ class RolloutWorker(EvaluatorInterface):
                logger.info("Could not seed torch")
        if _has_tensorflow_graph(policy_dict) and not (tf and
                                                       tf.executing_eagerly()):
-            if (ray.is_initialized()
-                    and ray.worker._mode() != ray.worker.LOCAL_MODE
-                    and not ray.get_gpu_ids()):
-                logger.debug("Creating policy evaluation worker {}".format(
-                    worker_index) +
-                             " on CPU (please ignore any CUDA init errors)")
            if not tf:
                raise ImportError("Could not import tensorflow")
+            if (ray.is_initialized()
+                    and ray.worker._mode() != ray.worker.LOCAL_MODE):
+                if not ray.get_gpu_ids():
+                    logger.debug(
+                        "Creating policy evaluation worker {}".format(
+                            worker_index) +
+                        " on CPU (please ignore any CUDA init errors)")
+                elif not tf.test.is_gpu_available():
+                    raise RuntimeError(
+                        "GPUs were assigned to this worker by Ray, but "
+                        "TensorFlow reports GPU acceleration is disabled. "
+                        "This could be due to a bad CUDA or TF installation.")
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()