mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 06:12:09 +08:00
[tune][docs/util] gputil check, docs (#11260)
Co-authored-by: Amog Kamsetty <amogkam@users.noreply.github.com>
This commit is contained in:
@@ -161,9 +161,9 @@ def DistributedTrainableCreator(
|
||||
to 60 seconds.
|
||||
|
||||
Returns:
|
||||
A trainable class object that can be passed to Tune. Resources
|
||||
are automatically set within the object, so users do
|
||||
not need to set `resources_per_trainable`.
|
||||
type(Trainable): A trainable class object that can be passed
|
||||
to Tune. Resources are automatically set within the object, so
|
||||
users do not need to set `resources_per_trainable`.
|
||||
|
||||
Example:
|
||||
|
||||
@@ -214,8 +214,9 @@ def distributed_checkpoint_dir(
|
||||
disable (bool): Disable for prototyping.
|
||||
|
||||
Yields:
|
||||
path (str): A path to a directory. This path will be used
|
||||
again when invoking the training_function.
|
||||
str: A path to a directory. This path will be used
|
||||
again when invoking the training_function.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -111,6 +111,14 @@ def checkpoint_dir(step):
|
||||
Store any files related to restoring state within the
|
||||
provided checkpoint dir.
|
||||
|
||||
You should call this *before* calling ``tune.report``. This is
|
||||
because we want checkpoints to be correlated with the result
|
||||
(i.e., be able to retrieve the best checkpoint, etc). Many algorithms
|
||||
depend on this behavior too.
|
||||
|
||||
Calling ``checkpoint_dir`` after report could introduce
|
||||
inconsistencies.
|
||||
|
||||
Args:
|
||||
step (int): Index for the checkpoint. Expected to be a
|
||||
monotonically increasing quantity.
|
||||
|
||||
@@ -3,12 +3,12 @@ from ray.tune.utils.util import (
|
||||
pin_in_object_store, unflattened_lookup, UtilMonitor,
|
||||
validate_save_restore, warn_if_slow, diagnose_serialization,
|
||||
detect_checkpoint_function, detect_reporter, detect_config_single,
|
||||
env_integer)
|
||||
env_integer, wait_for_gpu)
|
||||
|
||||
__all__ = [
|
||||
"deep_update", "date_str", "flatten_dict", "get_pinned_object",
|
||||
"merge_dicts", "pin_in_object_store", "unflattened_lookup", "UtilMonitor",
|
||||
"validate_save_restore", "warn_if_slow", "diagnose_serialization",
|
||||
"detect_checkpoint_function", "detect_reporter", "detect_config_single",
|
||||
"env_integer"
|
||||
"env_integer", "wait_for_gpu"
|
||||
]
|
||||
|
||||
@@ -311,18 +311,19 @@ def _from_pinnable(obj):
|
||||
|
||||
|
||||
def diagnose_serialization(trainable):
|
||||
"""Utility for detecting accidentally-scoped objects.
|
||||
"""Utility for detecting why your trainable function isn't serializing.
|
||||
|
||||
Args:
|
||||
trainable (cls | func): The trainable object passed to
|
||||
tune.run(trainable).
|
||||
trainable (func): The trainable object passed to
|
||||
tune.run(trainable). Currently only supports
|
||||
Function API.
|
||||
|
||||
Returns:
|
||||
bool | set of unserializable objects.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block::
|
||||
.. code-block:: python
|
||||
|
||||
import threading
|
||||
# this is not serializable
|
||||
@@ -396,6 +397,59 @@ def diagnose_serialization(trainable):
|
||||
return failure_set
|
||||
|
||||
|
||||
def wait_for_gpu(gpu_id=None, gpu_memory_limit=0.1, retry=20):
    """Blocks until a given GPU's used memory drops to the given limit.

    Requires ``gputil`` to be installed: ``pip install gputil``.

    Args:
        gpu_id (Optional[int]): Index of the GPU to check within
            ``GPUtil.getGPUs()``. Numeric strings are converted to an
            index. If None, resorts to the first item returned from
            ``ray.get_gpu_ids()``.
        gpu_memory_limit (float): The check succeeds once the GPU's
            ``memoryUsed`` reading from GPUtil is at or below this
            quantity. NOTE(review): GPUtil appears to report
            ``memoryUsed`` in MB -- confirm the intended unit.
        retry (int): Number of times to check GPU limit. Sleeps 5
            seconds between checks.

    Returns:
        bool: True if free.

    Raises:
        RuntimeError: If GPUtil is not found, if no GPUs are detected,
            if ``gpu_id`` cannot be resolved to a GPU, or if the memory
            is still above the limit after all retries.

    Example:

    .. code-block:: python

        def tune_func(config):
            tune.utils.wait_for_gpu()
            train()

        tune.run(tune_func, resources_per_trial={"gpu": 1}, num_samples=10)
    """
    if GPUtil is None:
        raise RuntimeError(
            "GPUtil must be installed if calling `wait_for_gpu`.")

    # Compare against None explicitly: GPU index 0 is falsy but is a
    # perfectly valid, explicitly requested device.
    if gpu_id is None:
        gpu_id_list = ray.get_gpu_ids()
        if not gpu_id_list:
            raise RuntimeError(f"No GPU ids found from {gpu_id_list}. "
                               "Did you set Tune resources correctly?")
        gpu_id = gpu_id_list[0]

    try:
        gpu_index = int(gpu_id)
    except (TypeError, ValueError):
        raise RuntimeError(
            f"GPU id {gpu_id!r} is not a valid index into "
            "GPUtil.getGPUs().") from None

    for _ in range(int(retry)):
        # Query GPUtil on every attempt so that each check sees a fresh
        # reading instead of the value captured before the first sleep.
        gpus = GPUtil.getGPUs()
        try:
            gpu_object = gpus[gpu_index]
        except IndexError:
            raise RuntimeError(
                f"GPU {gpu_id} not found: GPUtil detected "
                f"{len(gpus)} GPU(s).") from None

        if gpu_object.memoryUsed <= gpu_memory_limit:
            return True

        logger.info(f"Waiting for GPU {gpu_id} memory to free. "
                    f"Mem: {gpu_object.memoryUsed:0.3f}")
        time.sleep(5)

    raise RuntimeError("GPU memory was not freed.")
|
||||
|
||||
|
||||
def validate_save_restore(trainable_cls,
|
||||
config=None,
|
||||
num_gpus=0,
|
||||
|
||||
Reference in New Issue
Block a user