From 56f858ed1aa5936f5ae512a718509fe886c4092d Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Sat, 10 Oct 2020 00:54:31 -0700
Subject: [PATCH] [tune][docs/util] gputil check, docs (#11260)

Co-authored-by: Amog Kamsetty <amogkam@users.noreply.github.com>
---
 doc/source/tune/_tutorials/_faq.rst     |  53 ++++++------
 doc/source/tune/_tutorials/overview.rst |   2 +-
 doc/source/tune/api_docs/overview.rst   |   2 +-
 doc/source/tune/api_docs/trainable.rst  |  16 +++-
 doc/source/tune/index.rst               |   4 +-
 doc/source/tune/key-concepts.rst        |  62 ++++++++------
 doc/source/tune/user-guide.rst          | 103 ++++++++++++++++--------
 python/ray/tune/integration/torch.py    |  11 +--
 python/ray/tune/session.py              |   8 ++
 python/ray/tune/utils/__init__.py       |   4 +-
 python/ray/tune/utils/util.py           |  62 +++++++++++++-
 11 files changed, 229 insertions(+), 98 deletions(-)

diff --git a/doc/source/tune/_tutorials/_faq.rst b/doc/source/tune/_tutorials/_faq.rst
index 9358eddfa..a796b4686 100644
--- a/doc/source/tune/_tutorials/_faq.rst
+++ b/doc/source/tune/_tutorials/_faq.rst
@@ -144,6 +144,9 @@ train each trial at least for ``n`` epochs.
 
 Why are all my trials returning "1" iteration?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**This is most likely applicable for the Tune function API.**
+
 Ray Tune counts iterations internally every time ``tune.report()`` is
 called. If you only call ``tune.report()`` once at the end of the training,
 the counter has only been incremented once. If you're using the class API,
@@ -156,33 +159,34 @@ like Hyperband/ASHA can terminate bad performing trials early.
 
 What are all these extra outputs?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 You'll notice that Ray Tune not only reports hyperparameters (from the
 ``config``) or metrics (passed to ``tune.report()``), but also some other
-outputs. The ``Trial.last_result`` dictionary contains the following
-additional outputs:
+outputs.
 
-* ``config``: The hyperparameter configuration
-* ``date``: String-formatted date and time when the result was processed
-* ``done``: True if the trial has been finished, False otherwise
-* ``episodes_total``: Total number of episodes (for RLLib trainables)
-* ``experiment_id``: Unique experiment ID
-* ``experiment_tag``: Unique experiment tag (includes parameter values)
-* ``hostname``: Hostname of the worker
-* ``iterations_since_restore``: The number of times ``tune.report()`` has been
-  called after restoring the run from a checkpoint
-* ``node_ip``: Host IP of the worker
-* ``pid``: Process ID (PID) of the worker process
-* ``time_since_restore``: Time in seconds since restoring from a checkpoint.
-* ``time_this_iter_s``: Runtime of the current training iteration in seconds (i.e.
-  one call to the trainable function or to ``_train()`` in the class API.
-* ``time_total_s``: Total runtime in seconds.
-* ``timestamp``: Timestamp when the result was processed
-* ``timesteps_since_restore``: Number of timesteps since restoring from a checkpoint
-* ``timesteps_total``: Total number of timesteps
-* ``training_iteration``: The number of times ``tune.report()`` has been
-  called
-* ``trial_id``: Unique trial ID
+.. code-block:: bash
 
+    Result for easy_objective_c64c9112:
+      date: 2020-10-07_13-29-18
+      done: false
+      experiment_id: 6edc31257b564bf8985afeec1df618ee
+      experiment_tag: 7_activation=tanh,height=-53.116,steps=100,width=13.885
+      hostname: ubuntu
+      iterations: 0
+      iterations_since_restore: 1
+      mean_loss: 4.688385317424468
+      neg_mean_loss: -4.688385317424468
+      node_ip: 192.168.1.115
+      pid: 5973
+      time_since_restore: 7.605552673339844e-05
+      time_this_iter_s: 7.605552673339844e-05
+      time_total_s: 7.605552673339844e-05
+      timestamp: 1602102558
+      timesteps_since_restore: 0
+      training_iteration: 1
+      trial_id: c64c9112
+
+See the :ref:`tune-autofilled-metrics` section for a glossary.
 
 How do I set resources?
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -223,6 +227,9 @@ has machines that can actually fulfill your resource requests.
 
 How can I pass further parameter values to my trainable function?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**This is only applicable for the Tune function API.**
+
 Ray Tune expects your trainable functions to accept only up to two parameters,
 ``config`` and ``checkpoint_dir``. But sometimes there are cases where
 you want to pass constant arguments, like the number of epochs to run,
diff --git a/doc/source/tune/_tutorials/overview.rst b/doc/source/tune/_tutorials/overview.rst
index 5aa6360be..dfbd986b0 100644
--- a/doc/source/tune/_tutorials/overview.rst
+++ b/doc/source/tune/_tutorials/overview.rst
@@ -4,7 +4,7 @@
 Tutorials & FAQ
 ===============
 
-.. tip:: We'd love to hear your feedback on using Tune - fill out a `short survey <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!
+.. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!
 
 In this section, you can find material on how to use Tune and its various features. If any of the materials is out of date or broken, or if you'd like to add an example to this page, feel free to raise an issue on our Github repository.
 
diff --git a/doc/source/tune/api_docs/overview.rst b/doc/source/tune/api_docs/overview.rst
index 66b5b955e..c8cc034a6 100644
--- a/doc/source/tune/api_docs/overview.rst
+++ b/doc/source/tune/api_docs/overview.rst
@@ -3,7 +3,7 @@
 Tune API Reference
 ==================
 
-.. tip:: We'd love to hear your feedback on using Tune - fill out a `short survey <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!
+.. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!
 
 This section contains a reference for the Tune API. If there is anything missing, please open an issue
 on `Github`_.
diff --git a/doc/source/tune/api_docs/trainable.rst b/doc/source/tune/api_docs/trainable.rst
index 55224aacb..492639400 100644
--- a/doc/source/tune/api_docs/trainable.rst
+++ b/doc/source/tune/api_docs/trainable.rst
@@ -17,7 +17,7 @@ For the sake of example, let's maximize this objective function:
 Function API
 ------------
 
-Here is a simple example of using the function API. You can report intermediate metrics by simply calling ``tune.report`` within the provided function.
+With the Function API, you can report intermediate metrics by simply calling ``tune.report`` within the provided function.
 
 .. code-block:: python
 
@@ -40,6 +40,8 @@ Here is a simple example of using the function API. You can report intermediate
 
 Tune will run this function on a separate thread in a Ray actor process.
 
+You'll notice that Ray Tune will output extra values in addition to the user reported metrics, such as ``iterations_since_restore``. See :ref:`tune-autofilled-metrics` for an explanation/glossary of these values.
+
 .. tip:: If you want to leverage multi-node data parallel training with PyTorch while using parallel hyperparameter tuning, check out our :ref:`PyTorch <tune-pytorch-cifar>` user guide and Tune's :ref:`distributed pytorch integrations <tune-integration-torch>`.
 
 Function API return and yield values
@@ -182,6 +184,7 @@ As a subclass of ``tune.Trainable``, Tune will create a ``Trainable`` object on
 
 .. tip:: As a rule of thumb, the execution time of ``step`` should be large enough to avoid overheads (i.e. more than a few seconds), but short enough to report progress periodically (i.e. at most a few minutes).
 
+You'll notice that Ray Tune will output extra values in addition to the user reported metrics, such as ``iterations_since_restore``. See :ref:`tune-autofilled-metrics` for an explanation/glossary of these values.
 
 .. _tune-trainable-save-restore:
 
@@ -332,6 +335,17 @@ tune.Trainable (Class API)
     :private-members:
     :members:
 
+.. _tune-util-ref:
+
+Utilities
+---------
+
+.. autofunction:: ray.tune.utils.wait_for_gpu
+
+.. autofunction:: ray.tune.utils.diagnose_serialization
+
+.. autofunction:: ray.tune.utils.validate_save_restore
+
 
 .. _tune-ddp-doc:
 
diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst
index f00142f57..cbc974dab 100644
--- a/doc/source/tune/index.rst
+++ b/doc/source/tune/index.rst
@@ -1,6 +1,8 @@
 Tune: Scalable Hyperparameter Tuning
 ====================================
 
+.. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!
+
 .. image:: /images/tune.png
     :scale: 30%
     :align: center
@@ -17,8 +19,6 @@ Tune is a Python library for experiment execution and hyperparameter tuning at a
 
 **Want to get started?** Head over to the :doc:`Key Concepts page </tune/key-concepts>`.
 
-.. tip:: We'd love to hear your feedback on using Tune - fill out a `short survey <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!
-
 .. tip:: Join the `Ray community slack <https://forms.gle/9TSdDYUgxYs8SA9e8>`_ to discuss Ray Tune (and other Ray libraries)!
 
 
diff --git a/doc/source/tune/key-concepts.rst b/doc/source/tune/key-concepts.rst
index 648df7345..be194b5a2 100644
--- a/doc/source/tune/key-concepts.rst
+++ b/doc/source/tune/key-concepts.rst
@@ -15,62 +15,72 @@ Let's quickly walk through the key concepts you need to know to use Tune. In thi
 Trainables
 ----------
 
-Tune will optimize your training process using the :ref:`Trainable API <trainable-docs>`. To start, let's try to maximize this objective function:
+To start, let's try to maximize this objective function:
 
 .. code-block:: python
 
     def objective(x, a, b):
         return a * (x ** 0.5) + b
 
-Here's an example of specifying the objective function using :ref:`the function-based Trainable API <tune-function-api>`:
+To use Tune, you will need to wrap this function in a lightweight :ref:`trainable API <trainable-docs>`. You can either use a :ref:`function-based version <tune-function-api>` or a :ref:`class-based version <tune-class-api>`.
 
-.. code-block:: python
+.. tabs::
+    .. group-tab:: Function API
 
-    def trainable(config):
-        # config (dict): A dict of hyperparameters.
+        Here's an example of specifying the objective function using :ref:`the function-based Trainable API <tune-function-api>`:
 
-        for x in range(20):
-            score = objective(x, config["a"], config["b"])
+        .. code-block:: python
 
-            tune.report(score=score)  # This sends the score to Tune.
+            def trainable(config):
+                # config (dict): A dict of hyperparameters.
 
-Now, there's two Trainable APIs - one being the :ref:`function-based API <tune-function-api>` that we demonstrated above.
+                for x in range(20):
+                    score = objective(x, config["a"], config["b"])
 
-The other is a :ref:`class-based API <tune-class-api>`. Here's an example of specifying the objective function using the :ref:`class-based API <tune-class-api>`:
+                    tune.report(score=score)  # This sends the score to Tune.
 
-.. code-block:: python
+    .. group-tab:: Class API
 
-    from ray import tune
+        Here's an example of specifying the objective function using the :ref:`class-based API <tune-class-api>`:
 
-    class Trainable(tune.Trainable):
-        def setup(self, config):
-            # config (dict): A dict of hyperparameters
-            self.x = 0
-            self.a = config["a"]
-            self.b = config["b"]
+        .. code-block:: python
 
-        def step(self):  # This is called iteratively.
-            score = objective(self.x, self.a, self.b)
-            self.x += 1
-            return {"score": score}
+            from ray import tune
 
-.. tip:: Do not use ``tune.report`` within a ``Trainable`` class.
+            class Trainable(tune.Trainable):
+                def setup(self, config):
+                    # config (dict): A dict of hyperparameters
+                    self.x = 0
+                    self.a = config["a"]
+                    self.b = config["b"]
+
+                def step(self):  # This is called iteratively.
+                    score = objective(self.x, self.a, self.b)
+                    self.x += 1
+                    return {"score": score}
+
+        .. tip:: Do not use ``tune.report`` within a ``Trainable`` class.
 
 See the documentation: :ref:`trainable-docs` and :ref:`examples <tune-general-examples>`.
 
 tune.run and Trials
 -------------------
 
-Use ``tune.run`` execute hyperparameter tuning using the core Ray APIs. This function manages your experiment and provides many features such as :ref:`logging <tune-logging>`, :ref:`checkpointing <tune-checkpoint>`, and :ref:`early stopping <tune-stopping>`.
+Use :ref:`tune.run <tune-run-ref>` to execute hyperparameter tuning. This function manages your experiment and provides many features such as :ref:`logging <tune-logging>`, :ref:`checkpointing <tune-checkpoint>`, and :ref:`early stopping <tune-stopping>`.
 
 .. code-block:: python
 
     # Pass in a Trainable class or function to tune.run.
     tune.run(trainable)
 
-``tune.run`` will generate a couple hyperparameter configurations from its arguments, and each hyperparameter configuration is logically represented by a Trial object.
+``tune.run`` will generate a couple hyperparameter configurations from its arguments, wrapping them into :ref:`Trial objects <trial-docstring>`.
 
-Each trial has a resource specification (``resources_per_trial`` or ``trial.resources``), a hyperparameter configuration (``trial.config``), id (``trial.trial_id``), among other configuration values. Each trial is also associated with one instance of a :ref:`Trainable <trainable-docs>`. You can access trial objects through the :ref:`Analysis object <tune-concepts-analysis>` provided after ``tune.run`` finishes.
+Each trial has:
+
+* a hyperparameter configuration (``trial.config``) and an id (``trial.trial_id``)
+* a resource specification (``resources_per_trial`` or ``trial.resources``), among other configuration values
+
+Each trial is also associated with one instance of a :ref:`Trainable <trainable-docs>`. You can access trial objects through the :ref:`Analysis object <tune-concepts-analysis>` provided after ``tune.run`` finishes.
 
 ``tune.run`` will execute until all trials stop or error:
 
diff --git a/doc/source/tune/user-guide.rst b/doc/source/tune/user-guide.rst
index 111f166e6..ce44419ab 100644
--- a/doc/source/tune/user-guide.rst
+++ b/doc/source/tune/user-guide.rst
@@ -9,19 +9,15 @@ These pages will demonstrate the various features and configurations of Tune.
 
 This document provides an overview of the core concepts as well as some of the configurations for running Tune.
 
-.. contents:: :local:
-
 .. _tune-parallelism:
 
-Parallelism / GPUs
-------------------
+Resources (Parallelism, GPUs, Distributed)
+------------------------------------------
 
 .. tip:: To run everything sequentially, use :ref:`Ray Local Mode <tune-debugging>`.
 
 Parallelism is determined by ``resources_per_trial`` (defaulting to 1 CPU, 0 GPU per trial) and the resources available to Tune (``ray.cluster_resources()``).
 
-Tune will allocate the specified GPU and CPU from ``resources_per_trial`` to each individual trial. A trial will not be scheduled unless at least that amount of resources is available, preventing the cluster from being overloaded.
-
 By default, Tune automatically runs N concurrent trials, where N is the number of CPUs (cores) on your machine.
 
 .. code-block:: python
@@ -42,7 +38,13 @@ You can override this parallelism with ``resources_per_trial``:
     # Fractional values are also supported, (i.e., {"cpu": 0.5}).
     tune.run(trainable, num_samples=10, resources_per_trial={"cpu": 0.5})
 
-To leverage GPUs, you must set ``gpu`` in ``resources_per_trial``. This will automatically set ``CUDA_VISIBLE_DEVICES`` for each trial.
+
+Tune will allocate the specified GPU and CPU from ``resources_per_trial`` to each individual trial. A trial will not be scheduled unless at least that amount of resources is available, preventing the cluster from being overloaded.
+
+Using GPUs
+~~~~~~~~~~
+
+To leverage GPUs, you must set ``gpu`` in ``tune.run(resources_per_trial={})``. This will automatically set ``CUDA_VISIBLE_DEVICES`` for each trial.
 
 .. code-block:: python
 
@@ -56,6 +58,35 @@ You can find an example of this in the :doc:`Keras MNIST example </tune/examples
 
 .. warning:: If 'gpu' is not set, ``CUDA_VISIBLE_DEVICES`` environment variable will be set as empty, disallowing GPU access.
 
+**Troubleshooting**: Occasionally, you may run into GPU memory issues when running a new trial. This may be
+due to the previous trial not cleaning up its GPU state fast enough. To avoid this,
+you can use ``tune.utils.wait_for_gpu`` - see :ref:`docstring <tune-util-ref>`.
+
+
+Concurrent samples
+~~~~~~~~~~~~~~~~~~
+
+If using a :ref:`search algorithm <tune-search-alg>`, you may want to limit the number of trials that are being evaluated. For example, you may want to serialize the evaluation of trials to do sequential optimization.
+
+In this case, use ``ray.tune.suggest.ConcurrencyLimiter`` to limit the amount of concurrency:
+
+.. code-block:: python
+
+    algo = BayesOptSearch(utility_kwargs={
+        "kind": "ucb",
+        "kappa": 2.5,
+        "xi": 0.0
+    })
+    algo = ConcurrencyLimiter(algo, max_concurrent=4)
+    scheduler = AsyncHyperBandScheduler()
+
+See :ref:`limiter` for more details.
+
+
+
+Distributed Tuning
+~~~~~~~~~~~~~~~~~~
+
 To attach to a Ray cluster, simply run ``ray.init`` before ``tune.run``. See :ref:`start-ray-cli` for more information about ``ray.init``:
 
 .. code-block:: python
@@ -64,7 +95,7 @@ To attach to a Ray cluster, simply run ``ray.init`` before ``tune.run``. See :re
     ray.init(address=<ray_address>)
     tune.run(trainable, num_samples=100, resources_per_trial={"cpu": 2, "gpu": 1})
 
-
+Read more in the Tune :ref:`distributed experiments guide <tune-distributed>`.
 
 .. _tune-default-search-space:
 
@@ -106,37 +137,51 @@ By default, each random variable and grid search point is sampled once. To take
 
 Note that search spaces may not be interoperable across different search algorithms. For example, for many search algorithms, you will not be able to use a ``grid_search`` parameter. Read about this in the :ref:`Search Space API <tune-search-space>` page.
 
-Reporting Metrics
------------------
+.. _tune-autofilled-metrics:
+
+Auto-filled Metrics
+-------------------
 
 You can log arbitrary values and metrics in both training APIs:
 
 .. code-block:: python
 
     def trainable(config):
-        num_epochs = 100
         for i in range(num_epochs):
-            accuracy = model.train()
-            metric_1 = f(model)
-            metric_2 = model.get_loss()
+            ...
             tune.report(acc=accuracy, metric_foo=random_metric_1, bar=metric_2)
 
     class Trainable(tune.Trainable):
-        ...
-
-        def step(self):  # this is called iteratively
-            accuracy = self.model.train()
-            metric_1 = f(self.model)
-            metric_2 = self.model.get_loss()
+        def step(self):
+            ...
             # don't call report here!
             return dict(acc=accuracy, metric_foo=random_metric_1, bar=metric_2)
 
 During training, Tune will automatically log the below metrics in addition to the user-provided values. All of these can be used as stopping conditions or passed as a parameter to Trial Schedulers/Search Algorithms.
 
-.. literalinclude:: ../../../python/ray/tune/result.py
-   :language: python
-   :start-after: __sphinx_doc_begin__
-   :end-before: __sphinx_doc_end__
+* ``config``: The hyperparameter configuration
+* ``date``: String-formatted date and time when the result was processed
+* ``done``: True if the trial has been finished, False otherwise
+* ``episodes_total``: Total number of episodes (for RLLib trainables)
+* ``experiment_id``: Unique experiment ID
+* ``experiment_tag``: Unique experiment tag (includes parameter values)
+* ``hostname``: Hostname of the worker
+* ``iterations_since_restore``: The number of times ``tune.report()/trainable.train()`` has been
+  called after restoring the worker from a checkpoint
+* ``node_ip``: Host IP of the worker
+* ``pid``: Process ID (PID) of the worker process
+* ``time_since_restore``: Time in seconds since restoring from a checkpoint.
+* ``time_this_iter_s``: Runtime of the current training iteration in seconds (i.e.
+  one call to the trainable function or to ``step()`` in the class API).
+* ``time_total_s``: Total runtime in seconds.
+* ``timestamp``: Timestamp when the result was processed
+* ``timesteps_since_restore``: Number of timesteps since restoring from a checkpoint
+* ``timesteps_total``: Total number of timesteps
+* ``training_iteration``: The number of times ``tune.report()`` has been
+  called
+* ``trial_id``: Unique trial ID
+
+All of these metrics can be seen in the ``Trial.last_result`` dictionary.
 
 .. _tune-checkpoint:
 
@@ -378,14 +423,7 @@ If using TF2, Tune also automatically generates TensorBoard HParams output, as s
 Console Output
 --------------
 
-The following fields will automatically show up on the console output, if provided:
-
-1. ``episode_reward_mean``
-2. ``mean_loss``
-3. ``mean_accuracy``
-4. ``timesteps_this_iter`` (aggregated into ``timesteps_total``).
-
-Below is an example of the console output:
+User-provided fields will be outputted automatically on a best-effort basis. You can use a :ref:`Reporter <tune-reporter-doc>` object to customize the console output.
 
 .. code-block:: bash
 
@@ -404,7 +442,6 @@ Below is an example of the console output:
     | MyTrainable_a826b7bc | RUNNING  | 10.234.98.164:31112 | 0.729127  | 0.0748 | 0.1797 |        7.05715 |    14 |
     +----------------------+----------+---------------------+-----------+--------+--------+----------------+-------+
 
-You can use a :ref:`Reporter <tune-reporter-doc>` object to customize the console output.
 
 
 Uploading Results
diff --git a/python/ray/tune/integration/torch.py b/python/ray/tune/integration/torch.py
index 147734c7e..96305d9a5 100644
--- a/python/ray/tune/integration/torch.py
+++ b/python/ray/tune/integration/torch.py
@@ -161,9 +161,9 @@ def DistributedTrainableCreator(
             to 60 seconds.
 
     Returns:
-        A trainable class object that can be passed to Tune. Resources
-            are automatically set within the object, so users do
-            not need to set `resources_per_trainable`.
+        type(Trainable): A trainable class object that can be passed
+        to Tune. Resources are automatically set within the object, so
+        users do not need to set `resources_per_trial`.
 
     Example:
 
@@ -214,8 +214,9 @@ def distributed_checkpoint_dir(
         disable (bool): Disable for prototyping.
 
     Yields:
-        path (str): A path to a directory. This path will be used
-            again when invoking the training_function.
+        str: A path to a directory. This path will be used
+        again when invoking the training_function.
+
     Example:
 
     .. code-block:: python
diff --git a/python/ray/tune/session.py b/python/ray/tune/session.py
index 70bbbffd6..3fbecf912 100644
--- a/python/ray/tune/session.py
+++ b/python/ray/tune/session.py
@@ -111,6 +111,14 @@ def checkpoint_dir(step):
     Store any files related to restoring state within the
     provided checkpoint dir.
 
+    You should call this *before* calling ``tune.report``. The reason is
+    that we want checkpoints to be correlated with the result
+    (i.e., be able to retrieve the best checkpoint, etc). Many algorithms
+    depend on this behavior too.
+
+    Calling ``checkpoint_dir`` after report could introduce
+    inconsistencies.
+
     Args:
         step (int): Index for the checkpoint. Expected to be a
             monotonically increasing quantity.
diff --git a/python/ray/tune/utils/__init__.py b/python/ray/tune/utils/__init__.py
index fce04dc00..3210e9465 100644
--- a/python/ray/tune/utils/__init__.py
+++ b/python/ray/tune/utils/__init__.py
@@ -3,12 +3,12 @@ from ray.tune.utils.util import (
     pin_in_object_store, unflattened_lookup, UtilMonitor,
     validate_save_restore, warn_if_slow, diagnose_serialization,
     detect_checkpoint_function, detect_reporter, detect_config_single,
-    env_integer)
+    env_integer, wait_for_gpu)
 
 __all__ = [
     "deep_update", "date_str", "flatten_dict", "get_pinned_object",
     "merge_dicts", "pin_in_object_store", "unflattened_lookup", "UtilMonitor",
     "validate_save_restore", "warn_if_slow", "diagnose_serialization",
     "detect_checkpoint_function", "detect_reporter", "detect_config_single",
-    "env_integer"
+    "env_integer", "wait_for_gpu"
 ]
diff --git a/python/ray/tune/utils/util.py b/python/ray/tune/utils/util.py
index 4b051297b..45b8fca09 100644
--- a/python/ray/tune/utils/util.py
+++ b/python/ray/tune/utils/util.py
@@ -311,18 +311,19 @@ def _from_pinnable(obj):
 
 
 def diagnose_serialization(trainable):
-    """Utility for detecting accidentally-scoped objects.
+    """Utility for detecting why your trainable function isn't serializing.
 
     Args:
-        trainable (cls | func): The trainable object passed to
-            tune.run(trainable).
+        trainable (func): The trainable object passed to
+            tune.run(trainable). Currently only supports
+            Function API.
 
     Returns:
         bool | set of unserializable objects.
 
     Example:
 
-    .. code-block::
+    .. code-block:: python
 
         import threading
         # this is not serializable
@@ -396,6 +397,59 @@ def diagnose_serialization(trainable):
         return failure_set
 
 
+def wait_for_gpu(gpu_id=None, gpu_memory_limit=0.1, retry=20):
+    """Waits until a given GPU has freed memory.
+
+    Requires ``gputil`` to be installed: ``pip install gputil``.
+
+    Args:
+        gpu_id (Optional[int]): Index of the GPU to check within
+            GPUtil.getGPUs(). If None, resorts to
+            the first item returned from `ray.get_gpu_ids()`.
+        gpu_memory_limit (float): Threshold compared to GPUtil's
+            ``memoryUsed``; the check passes once usage is not above it.
+        retry (int): Number of times to check GPU limit. Sleeps 5
+            seconds between checks.
+
+    Returns:
+        bool
+            True if free.
+
+    Raises:
+        RuntimeError
+            If GPUtil is not found, if no GPUs are detected
+            or if the check fails.
+
+    Example:
+
+    .. code-block:: python
+
+        def tune_func(config):
+            tune.utils.wait_for_gpu()
+            train()
+
+        tune.run(tune_func, resources_per_trial={"gpu": 1}, num_samples=10)
+    """
+    if GPUtil is None:
+        raise RuntimeError(
+            "GPUtil must be installed if calling `wait_for_gpu`.")
+    if gpu_id is None:
+        gpu_id_list = ray.get_gpu_ids()
+        if not gpu_id_list:
+            raise RuntimeError(f"No GPU ids found from {ray.get_gpu_ids()}. "
+                               "Did you set Tune resources correctly?")
+        gpu_id = gpu_id_list[0]
+    for _ in range(int(retry)):
+        gpu_object = GPUtil.getGPUs()[gpu_id]  # re-query: stats are snapshots
+        if gpu_object.memoryUsed > gpu_memory_limit:
+            logger.info(f"Waiting for GPU {gpu_id} memory to free. "
+                        f"Mem: {gpu_object.memoryUsed:0.3f}")
+            time.sleep(5)
+        else:
+            return True
+    raise RuntimeError("GPU memory was not freed.")
+
+
 def validate_save_restore(trainable_cls,
                           config=None,
                           num_gpus=0,