mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:16:19 +08:00
[docs] slurm + progress_bar example (#10782)
This commit is contained in:
committed by
Barak Michener
parent
9c65373085
commit
decaa6dea0
@@ -24,6 +24,14 @@ py_test(
|
||||
tags = ["exclusive"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "progress_bar",
|
||||
size = "small",
|
||||
srcs = ["examples/progress_bar.py"],
|
||||
tags = ["exclusive"]
|
||||
)
|
||||
|
||||
|
||||
# Directory: examples/doc_code
|
||||
py_test(
|
||||
name = "doc_code_tf_example",
|
||||
|
||||
+81
-27
@@ -3,61 +3,115 @@ Ray Tutorials and Examples
|
||||
|
||||
Get started with Ray, Tune, and RLlib with these notebooks that you can run online in Colab or Binder: `Ray Tutorial Notebooks <https://github.com/ray-project/tutorial>`__
|
||||
|
||||
|
||||
Ray Examples
|
||||
------------
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="sphx-glr-bigcontainer">
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
|
||||
tips-for-first-time.rst
|
||||
plot_parameter_server.rst
|
||||
plot_example-a3c.rst
|
||||
plot_hyperparameter.rst
|
||||
plot_pong_example.rst
|
||||
plot_lbfgs.rst
|
||||
plot_newsreader.rst
|
||||
plot_streaming.rst
|
||||
plot_example-lm.rst
|
||||
testing-tips.rst
|
||||
progress_bar.rst
|
||||
plot_streaming.rst
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Tips for first time users.
|
||||
:figure: /images/pipeline.png
|
||||
:description: :doc:`/auto_examples/tips-for-first-time`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Tips for testing Ray applications
|
||||
:description: :doc:`/auto_examples/testing-tips`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Progress Bar for Ray Tasks
|
||||
:description: :doc:`/auto_examples/progress_bar`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Implement a simple streaming application using Ray’s actors.
|
||||
:description: :doc:`/auto_examples/plot_streaming`
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
Machine Learning Examples
|
||||
-------------------------
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="sphx-glr-bigcontainer">
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
|
||||
plot_parameter_server.rst
|
||||
plot_hyperparameter.rst
|
||||
plot_lbfgs.rst
|
||||
plot_example-lm.rst
|
||||
plot_newsreader.rst
|
||||
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Build a simple parameter server using Ray.
|
||||
:figure: /images/param_actor.png
|
||||
:description: :doc:`/auto_examples/plot_parameter_server`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Asynchronous Advantage Actor Critic agent using Ray.
|
||||
:figure: /images/a3c.png
|
||||
:description: :doc:`/auto_examples/plot_example-a3c`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Simple parallel asynchronous hyperparameter evaluation.
|
||||
:figure: /images/hyperparameter.png
|
||||
:description: :doc:`/auto_examples/plot_hyperparameter`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Parallelizing a policy gradient calculation on OpenAI Gym Pong.
|
||||
:figure: /images/pong.png
|
||||
:description: :doc:`/auto_examples/plot_pong_example`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Walkthrough of parallelizing the L-BFGS algorithm.
|
||||
:description: :doc:`/auto_examples/plot_lbfgs`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Implementing a simple news reader using Ray.
|
||||
:description: :doc:`/auto_examples/plot_newsreader`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Implement a simple streaming application using Ray’s actors.
|
||||
:description: :doc:`/auto_examples/plot_streaming`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Distributed Fault-Tolerant BERT training for FAIRSeq using Ray.
|
||||
:description: :doc:`/auto_examples/plot_example-lm`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Tips for testing Ray applications
|
||||
:description: :doc:`/auto_examples/testing-tips`
|
||||
:tooltip: Implementing a simple news reader using Ray.
|
||||
:description: :doc:`/auto_examples/plot_newsreader`
|
||||
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
Reinforcement Learning Examples
|
||||
-------------------------------
|
||||
|
||||
These are simple examples that show you how to leverage Ray Core. For Ray's production-grade reinforcement learning library, see `RLlib <http://docs.ray.io/en/latest/rllib.html>`__.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="sphx-glr-bigcontainer">
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
|
||||
plot_pong_example.rst
|
||||
plot_example-a3c.rst
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Asynchronous Advantage Actor Critic agent using Ray.
|
||||
:figure: /images/a3c.png
|
||||
:description: :doc:`/auto_examples/plot_example-a3c`
|
||||
|
||||
.. customgalleryitem::
|
||||
:tooltip: Parallelizing a policy gradient calculation on OpenAI Gym Pong.
|
||||
:figure: /images/pong.png
|
||||
:description: :doc:`/auto_examples/plot_pong_example`
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
Progress Bar for Ray Actors (tqdm)
|
||||
==================================
|
||||
|
||||
Tracking progress of distributed tasks can be tricky.
|
||||
|
||||
This script will demonstrate how to implement a simple
|
||||
progress bar for a Ray actor to track progress across various
|
||||
distributed components.
|
||||
|
||||
Setup: Dependencies
|
||||
-------------------
|
||||
|
||||
First, import some dependencies.
|
||||
"""
|
||||
|
||||
# Inspiration: https://github.com/honnibal/spacy-ray/pull/
|
||||
# 1/files#diff-7ede881ddc3e8456b320afb958362b2aR12-R45
|
||||
from asyncio import Event
|
||||
from typing import Tuple
|
||||
from time import sleep
|
||||
|
||||
import ray
|
||||
# For typing purposes
|
||||
from ray.actor import ActorHandle
|
||||
from tqdm import tqdm
|
||||
|
||||
############################################################
|
||||
# This is the Ray "actor" that can be called from anywhere to update
|
||||
# our progress. You'll be using the `update` method. Don't
|
||||
# instantiate this class yourself. Instead,
|
||||
# it's something that you'll get from a `ProgressBar`.
|
||||
|
||||
|
||||
@ray.remote
class ProgressBarActor:
    """Ray actor that accumulates progress reported by remote workers.

    Workers call `update` with the number of items they just finished.
    A consumer awaits `wait_for_update` to receive the number of items
    completed since its previous call, together with the running total.
    """

    counter: int
    delta: int
    event: Event

    def __init__(self) -> None:
        # `event` is set by `update` and cleared by `wait_for_update`.
        self.event = Event()
        # Items reported since the last `wait_for_update` call.
        self.delta = 0
        # Items reported over the lifetime of the actor.
        self.counter = 0

    def update(self, num_items_completed: int) -> None:
        """Add `num_items_completed` to the progress totals.

        Also sets the event that `wait_for_update` is waiting on.
        """
        self.delta += num_items_completed
        self.counter += num_items_completed
        self.event.set()

    async def wait_for_update(self) -> Tuple[int, int]:
        """Wait until `update` has been called, then report progress.

        Returns:
            A ``(delta, total)`` tuple: the number of items completed
            since the previous call to `wait_for_update`, and the total
            number of items completed so far.
        """
        await self.event.wait()
        self.event.clear()
        # Hand out the accumulated delta and start a fresh one.
        pending, self.delta = self.delta, 0
        return pending, self.counter

    def get_counter(self) -> int:
        """Return the total number of items completed so far."""
        return self.counter
|
||||
|
||||
|
||||
######################################################################
|
||||
# This is where the progress bar starts. You create one of these
|
||||
# on the head node, passing in the expected total number of items,
|
||||
# and an optional string description.
|
||||
# Pass along the `actor` reference to any remote task,
|
||||
# and if they complete ten
|
||||
# tasks, they'll call `actor.update.remote(10)`.
|
||||
|
||||
# Back on the local node, once you launch your remote Ray tasks, call
|
||||
# `print_until_done`, which will feed everything back into a `tqdm` counter.
|
||||
|
||||
|
||||
class ProgressBar:
    """Local handle for a progress bar driven by a `ProgressBarActor`.

    Create one with the expected total number of items, pass `actor`
    to the remote tasks so they can call `update` on it, then call
    `print_until_done` to render the progress with `tqdm`.
    """

    progress_actor: ActorHandle
    total: int
    description: str
    # Declared for typing only; the tqdm bar itself is created inside
    # `print_until_done`.
    pbar: tqdm

    def __init__(self, total: int, description: str = ""):
        """Start the backing actor and remember the bar settings.

        Args:
            total: Expected total number of items to complete.
            description: Optional label passed to tqdm as `desc`.
        """
        # Ray actors don't seem to play nice with mypy, generating
        # a spurious warning for the following line,
        # which we need to suppress. The code is fine.
        self.progress_actor = ProgressBarActor.remote()  # type: ignore
        self.total = total
        self.description = description

    @property
    def actor(self) -> ActorHandle:
        """Returns a reference to the remote `ProgressBarActor`.

        When you complete tasks, call `update` on the actor.
        """
        return self.progress_actor

    def print_until_done(self) -> None:
        """Blocking call.

        Do this after starting a series of remote Ray tasks, to which you've
        passed the actor handle. Each of them calls `update` on the actor.
        When the reported count reaches `total`, this method returns.
        If `total` is zero or negative there is nothing to wait for, so
        the method returns immediately.
        """
        # The context manager closes the bar on normal return and also
        # when `ray.get` raises.
        with tqdm(desc=self.description, total=self.total) as pbar:
            if self.total <= 0:
                # `wait_for_update` only returns after an `update` call,
                # so waiting here could block forever.
                return
            while True:
                delta, counter = ray.get(
                    self.actor.wait_for_update.remote())
                pbar.update(delta)
                if counter >= self.total:
                    return
|
||||
|
||||
|
||||
#################################################################
|
||||
# This is an example of a task that increments the progress bar.
|
||||
# Note that this is a Ray Task, but it could very well
|
||||
# be any generic Ray Actor.
|
||||
#
|
||||
@ray.remote
def sleep_then_increment(i: int, pba: ActorHandle) -> int:
    """Sleep for ``i / 2`` seconds, then report one completed item.

    Args:
        i: Determines the sleep duration and is returned unchanged.
        pba: Handle whose `update` method is called remotely.

    Returns:
        The value of `i`.
    """
    delay_seconds = i / 2.0
    sleep(delay_seconds)
    # The result of this remote call is not waited on.
    pba.update.remote(1)
    return i
|
||||
|
||||
|
||||
#################################################################
|
||||
# Now you can run it and see what happens!
|
||||
#
|
||||
|
||||
|
||||
def run():
    """Launch the example tasks, show their progress, and check results.

    Starts Ray, submits `num_ticks` `sleep_then_increment` tasks that
    share one progress actor, blocks until the bar is complete, and
    then verifies both the task results and the actor's counter.
    """
    ray.init()
    num_ticks = 6
    pb = ProgressBar(num_ticks)
    actor = pb.actor
    # You can replace this with any arbitrary Ray task/actor.
    tasks_pre_launch = [
        sleep_then_increment.remote(i, actor) for i in range(0, num_ticks)
    ]

    pb.print_until_done()
    tasks = ray.get(tasks_pre_launch)

    # Each task returns its own index, and each reports exactly one
    # completed item, so both checks must hold once the bar is done.
    assert tasks == list(range(num_ticks))
    assert num_ticks == ray.get(actor.get_counter.remote())


run()
|
||||
@@ -3,7 +3,26 @@
|
||||
Deploying on Slurm
|
||||
==================
|
||||
|
||||
Clusters managed by Slurm may require that Ray is initialized as a part of the submitted job. This can be done by using ``srun`` within the submitted script. For example:
|
||||
Clusters managed by Slurm may require that Ray is initialized as a part of the submitted job. This can be done by using ``srun`` within the submitted script.
|
||||
|
||||
Examples and templates
|
||||
----------------------
|
||||
|
||||
Here are some community-contributed templates for using SLURM with Ray:
|
||||
|
||||
- `Ray sbatch submission scripts`_ used at `NERSC <https://www.nersc.gov/>`_, a US national lab.
|
||||
- `YASPI`_ (yet another slurm python interface) by @albanie. The goal of yaspi is to provide an interface for submitting Slurm jobs, thereby obviating the joys of sbatch files. It does so through recipes, which are collections of templates and rules for generating sbatch scripts. It also supports job submissions for Ray.
|
||||
|
||||
- `Template script`_ by @pengzhenghao
|
||||
|
||||
.. _`Ray sbatch submission scripts`: https://github.com/NERSC/slurm-ray-cluster
|
||||
|
||||
.. _`YASPI`: https://github.com/albanie/yaspi
|
||||
|
||||
.. _`Template script`: https://gist.github.com/pengzhenghao/b348db1075101a9b986c4cdfea13dcd6
|
||||
|
||||
Starter SLURM script
|
||||
--------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
||||
Reference in New Issue
Block a user