[tune][docs/util] gputil check, docs (#11260)

Co-authored-by: Amog Kamsetty <amogkam@users.noreply.github.com>
This commit is contained in:
Richard Liaw
2020-10-10 00:54:31 -07:00
committed by GitHub
parent defd41aad7
commit 56f858ed1a
11 changed files with 229 additions and 98 deletions
+6 -5
View File
@@ -161,9 +161,9 @@ def DistributedTrainableCreator(
to 60 seconds.
Returns:
A trainable class object that can be passed to Tune. Resources
are automatically set within the object, so users do
not need to set `resources_per_trainable`.
type(Trainable): A trainable class object that can be passed
to Tune. Resources are automatically set within the object, so
users do not need to set `resources_per_trainable`.
Example:
@@ -214,8 +214,9 @@ def distributed_checkpoint_dir(
disable (bool): Disable for prototyping.
Yields:
path (str): A path to a directory. This path will be used
again when invoking the training_function.
str: A path to a directory. This path will be used
again when invoking the training_function.
Example:
.. code-block:: python
+8
View File
@@ -111,6 +111,14 @@ def checkpoint_dir(step):
Store any files related to restoring state within the
provided checkpoint dir.
You should call this *before* calling ``tune.report``, because
checkpoints need to be correlated with the result they belong to
(e.g., to be able to retrieve the best checkpoint). Many algorithms
depend on this behavior too.
Calling ``checkpoint_dir`` after ``tune.report`` could introduce
inconsistencies.
Args:
step (int): Index for the checkpoint. Expected to be a
monotonically increasing quantity.
+2 -2
View File
@@ -3,12 +3,12 @@ from ray.tune.utils.util import (
pin_in_object_store, unflattened_lookup, UtilMonitor,
validate_save_restore, warn_if_slow, diagnose_serialization,
detect_checkpoint_function, detect_reporter, detect_config_single,
env_integer)
env_integer, wait_for_gpu)
__all__ = [
"deep_update", "date_str", "flatten_dict", "get_pinned_object",
"merge_dicts", "pin_in_object_store", "unflattened_lookup", "UtilMonitor",
"validate_save_restore", "warn_if_slow", "diagnose_serialization",
"detect_checkpoint_function", "detect_reporter", "detect_config_single",
"env_integer"
"env_integer", "wait_for_gpu"
]
+58 -4
View File
@@ -311,18 +311,19 @@ def _from_pinnable(obj):
def diagnose_serialization(trainable):
"""Utility for detecting accidentally-scoped objects.
"""Utility for detecting why your trainable function isn't serializing.
Args:
trainable (cls | func): The trainable object passed to
tune.run(trainable).
trainable (func): The trainable object passed to
tune.run(trainable). Currently only supports
Function API.
Returns:
bool | set of unserializable objects.
Example:
.. code-block::
.. code-block:: python
import threading
# this is not serializable
@@ -396,6 +397,59 @@ def diagnose_serialization(trainable):
return failure_set
def wait_for_gpu(gpu_id=None, gpu_memory_limit=0.1, retry=20, delay_s=5):
    """Waits until a given GPU has freed its memory.

    The GPU is re-queried through ``GPUtil.getGPUs()`` on every attempt, so
    each check sees a fresh memory reading.

    Requires ``gputil`` to be installed: ``pip install gputil``.

    Args:
        gpu_id (Optional[int | str]): Index of the GPU to check within
            ``GPUtil.getGPUs()``. Numeric strings are converted to
            integers. If None, resorts to the first item returned from
            ``ray.get_gpu_ids()``.
        gpu_memory_limit (float): The wait ends as soon as the GPU's
            ``memoryUsed`` reading is not above this quantity. The value
            is compared directly against GPUtil's ``memoryUsed`` (which
            GPUtil reports in MB), not against a utilization fraction.
        retry (int): Maximum number of times to check the GPU.
        delay_s (float): Seconds to sleep between two consecutive checks.
            Defaults to 5.

    Returns:
        bool
            True if free.

    Raises:
        RuntimeError
            If GPUtil is not found, if no GPU ids are available, if the
            requested GPU is not detected by GPUtil, or if the memory was
            not freed within ``retry`` checks.
        ValueError
            If ``gpu_id`` is a string that is not an integer index.

    Example:

    .. code-block:: python

        def tune_func(config):
            tune.utils.wait_for_gpu()
            train()

        tune.run(tune_func, resources_per_trial={"gpu": 1}, num_samples=10)
    """
    if GPUtil is None:
        raise RuntimeError(
            "GPUtil must be installed if calling `wait_for_gpu`.")

    # Compare against None explicitly: GPU index 0 is falsy but valid, and
    # must not be silently replaced by whatever Ray assigned.
    if gpu_id is None:
        gpu_id_list = ray.get_gpu_ids()
        if not gpu_id_list:
            raise RuntimeError(f"No GPU ids found from {ray.get_gpu_ids()}. "
                               "Did you set Tune resources correctly?")
        gpu_id = gpu_id_list[0]

    # The id is used as a list index below, so a string id such as "0"
    # has to be converted first (int() raises ValueError otherwise).
    if isinstance(gpu_id, str):
        gpu_id = int(gpu_id)

    attempts = int(retry)
    for attempt in range(attempts):
        # GPUtil returns a snapshot of the current GPU state; query it
        # again on every attempt or the reading would never change.
        detected_gpus = GPUtil.getGPUs()
        try:
            gpu_object = detected_gpus[gpu_id]
        except IndexError:
            raise RuntimeError(
                f"GPU {gpu_id} was not detected by GPUtil "
                f"({len(detected_gpus)} GPU(s) found).") from None

        if gpu_object.memoryUsed > gpu_memory_limit:
            logger.info(f"Waiting for GPU {gpu_id} memory to free. "
                        f"Mem: {gpu_object.memoryUsed:0.3f}")
            # No point in sleeping after the final check; fail right away.
            if attempt < attempts - 1:
                time.sleep(delay_s)
        else:
            return True
    raise RuntimeError("GPU memory was not freed.")
def validate_save_restore(trainable_cls,
config=None,
num_gpus=0,