mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 15:44:37 +08:00
[rllib] Raise an error if GPUs are enabled but not tf.test.is_gpu_available() (#6365)
This commit is contained in:
@@ -55,14 +55,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     --stop '{"training_iteration": 1}' \
     --config '{"simple_optimizer": true, "num_sgd_iter": 2, "model": {"use_lstm": true}}'
 
-docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output --force-direct /ray/rllib/train.py \
-    --env CartPole-v1 \
-    --run PPO \
-    --stop '{"training_iteration": 1}' \
-    --config '{"num_gpus": 0.1}' \
-    --ray-num-gpus 1
-
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output --force-direct /ray/rllib/train.py \
     --env CartPole-v1 \

@@ -118,7 +118,7 @@ class RayTrialExecutor(TrialExecutor):
         # Clear the Trial's location (to be updated later on result)
         # since we don't know where the remote runner is placed.
         trial.set_location(Location())
-        logger.info("Trial %s: Setting up new remote runner.", trial)
+        logger.debug("Trial %s: Setting up new remote runner.", trial)
         # Logging for trials is handled centrally by TrialRunner, so
         # configure the remote runner to use a noop-logger.
         return cls.remote(config=trial.config, logger_creator=logger_creator)

@@ -327,14 +327,20 @@ class RolloutWorker(EvaluatorInterface):
                 logger.info("Could not seed torch")
         if _has_tensorflow_graph(policy_dict) and not (tf and
                                                        tf.executing_eagerly()):
-            if (ray.is_initialized()
-                    and ray.worker._mode() != ray.worker.LOCAL_MODE
-                    and not ray.get_gpu_ids()):
-                logger.debug("Creating policy evaluation worker {}".format(
-                    worker_index) +
-                             " on CPU (please ignore any CUDA init errors)")
             if not tf:
                 raise ImportError("Could not import tensorflow")
+            if (ray.is_initialized()
+                    and ray.worker._mode() != ray.worker.LOCAL_MODE):
+                if not ray.get_gpu_ids():
+                    logger.debug(
+                        "Creating policy evaluation worker {}".format(
+                            worker_index) +
+                        " on CPU (please ignore any CUDA init errors)")
+                elif not tf.test.is_gpu_available():
+                    raise RuntimeError(
+                        "GPUs were assigned to this worker by Ray, but "
+                        "TensorFlow reports GPU acceleration is disabled. "
+                        "This could be due to a bad CUDA or TF installation.")
             with tf.Graph().as_default():
                 if tf_session_creator:
                     self.tf_sess = tf_session_creator()

Reference in New Issue
Block a user