diff --git a/doc/BUILD b/doc/BUILD index ee177b3c5..de5d1e2eb 100644 --- a/doc/BUILD +++ b/doc/BUILD @@ -24,6 +24,14 @@ py_test( tags = ["exclusive"] ) +py_test( + name = "progress_bar", + size = "small", + srcs = ["examples/progress_bar.py"], + tags = ["exclusive"] +) + + # Directory: examples/doc_code py_test( name = "doc_code_tf_example", diff --git a/doc/examples/overview.rst b/doc/examples/overview.rst index 30b1b27d5..d23bdcadd 100644 --- a/doc/examples/overview.rst +++ b/doc/examples/overview.rst @@ -3,61 +3,115 @@ Ray Tutorials and Examples Get started with Ray, Tune, and RLlib with these notebooks that you can run online in Colab or Binder: `Ray Tutorial Notebooks `__ + +Ray Examples +------------ + +.. raw:: html + +
+ .. toctree:: :hidden: tips-for-first-time.rst - plot_parameter_server.rst - plot_example-a3c.rst - plot_hyperparameter.rst - plot_pong_example.rst - plot_lbfgs.rst - plot_newsreader.rst - plot_streaming.rst - plot_example-lm.rst testing-tips.rst + progress_bar.rst + plot_streaming.rst .. customgalleryitem:: :tooltip: Tips for first time users. :figure: /images/pipeline.png :description: :doc:`/auto_examples/tips-for-first-time` +.. customgalleryitem:: + :tooltip: Tips for testing Ray applications + :description: :doc:`/auto_examples/testing-tips` + +.. customgalleryitem:: + :tooltip: Progress Bar for Ray Tasks + :description: :doc:`/auto_examples/progress_bar` + +.. customgalleryitem:: + :tooltip: Implement a simple streaming application using Ray’s actors. + :description: :doc:`/auto_examples/plot_streaming` + +.. raw:: html + +
+ + +Machine Learning Examples +------------------------- + +.. raw:: html + +
+ +.. toctree:: + :hidden: + + plot_parameter_server.rst + plot_hyperparameter.rst + plot_lbfgs.rst + plot_example-lm.rst + plot_newsreader.rst + + .. customgalleryitem:: :tooltip: Build a simple parameter server using Ray. :figure: /images/param_actor.png :description: :doc:`/auto_examples/plot_parameter_server` -.. customgalleryitem:: - :tooltip: Asynchronous Advantage Actor Critic agent using Ray. - :figure: /images/a3c.png - :description: :doc:`/auto_examples/plot_example-a3c` - .. customgalleryitem:: :tooltip: Simple parallel asynchronous hyperparameter evaluation. :figure: /images/hyperparameter.png :description: :doc:`/auto_examples/plot_hyperparameter` -.. customgalleryitem:: - :tooltip: Parallelizing a policy gradient calculation on OpenAI Gym Pong. - :figure: /images/pong.png - :description: :doc:`/auto_examples/plot_pong_example` - .. customgalleryitem:: :tooltip: Walkthrough of parallelizing the L-BFGS algorithm. :description: :doc:`/auto_examples/plot_lbfgs` -.. customgalleryitem:: - :tooltip: Implementing a simple news reader using Ray. - :description: :doc:`/auto_examples/plot_newsreader` - -.. customgalleryitem:: - :tooltip: Implement a simple streaming application using Ray’s actors. - :description: :doc:`/auto_examples/plot_streaming` .. customgalleryitem:: :tooltip: Distributed Fault-Tolerant BERT training for FAIRSeq using Ray. :description: :doc:`/auto_examples/plot_example-lm` .. customgalleryitem:: - :tooltip: Tips for testing Ray applications - :description: :doc:`/auto_examples/testing-tips` + :tooltip: Implementing a simple news reader using Ray. + :description: :doc:`/auto_examples/plot_newsreader` + + +.. raw:: html + +
+ + +Reinforcement Learning Examples +------------------------------- + +These are simple examples that show you how to leverage Ray Core. For Ray's production-grade reinforcement learning library, see `RLlib `__. + +.. raw:: html + +
+ +.. toctree:: + :hidden: + + plot_pong_example.rst + plot_example-a3c.rst + +.. customgalleryitem:: + :tooltip: Asynchronous Advantage Actor Critic agent using Ray. + :figure: /images/a3c.png + :description: :doc:`/auto_examples/plot_example-a3c` + +.. customgalleryitem:: + :tooltip: Parallelizing a policy gradient calculation on OpenAI Gym Pong. + :figure: /images/pong.png + :description: :doc:`/auto_examples/plot_pong_example` + +.. raw:: html + +
diff --git a/doc/examples/progress_bar.py b/doc/examples/progress_bar.py
new file mode 100644
index 000000000..a3032db30
--- /dev/null
+++ b/doc/examples/progress_bar.py
@@ -0,0 +1,165 @@
+"""
+Progress Bar for Ray Actors (tqdm)
+==================================
+
+Tracking progress of distributed tasks can be tricky.
+
+This script will demonstrate how to implement a simple
+progress bar for a Ray actor to track progress across various
+different distributed components.
+
+Setup: Dependencies
+-------------------
+
+First, import some dependencies.
+"""
+
+# Inspiration: https://github.com/honnibal/spacy-ray/pull/
+# 1/files#diff-7ede881ddc3e8456b320afb958362b2aR12-R45
+from asyncio import Event
+from typing import Tuple
+from time import sleep
+
+import ray
+# For typing purposes
+from ray.actor import ActorHandle
+from tqdm import tqdm
+
+############################################################
+# This is the Ray "actor" that can be called from anywhere to update
+# our progress. You'll be using the `update` method. Don't
+# instantiate this class yourself. Instead,
+# it's something that you'll get from a `ProgressBar`.
+
+
+@ray.remote
+class ProgressBarActor:
+    """Ray actor that accumulates progress reported by remote workers."""
+
+    counter: int
+    delta: int
+    event: Event
+
+    def __init__(self) -> None:
+        self.counter = 0
+        self.delta = 0
+        self.event = Event()
+
+    def update(self, num_items_completed: int) -> None:
+        """Updates the ProgressBar with the incremental
+        number of items that were just completed.
+        """
+        self.counter += num_items_completed
+        self.delta += num_items_completed
+        self.event.set()
+
+    async def wait_for_update(self) -> Tuple[int, int]:
+        """Blocking call.
+
+        Waits until somebody calls `update`, then returns a tuple of
+        the number of items completed since the last call to
+        `wait_for_update`, and the total number of completed items.
+        """
+        await self.event.wait()
+        self.event.clear()
+        saved_delta = self.delta
+        self.delta = 0
+        return saved_delta, self.counter
+
+    def get_counter(self) -> int:
+        """
+        Returns the total number of complete items.
+        """
+        return self.counter
+
+
+######################################################################
+# This is where the progress bar starts. You create one of these
+# on the head node, passing in the expected total number of items,
+# and an optional string description.
+# Pass along the `actor` reference to any remote task,
+# and if they complete ten
+# tasks, they'll call `actor.update.remote(10)`.
+
+# Back on the local node, once you launch your remote Ray tasks, call
+# `print_until_done`, which will feed everything back into a `tqdm` counter.
+
+
+class ProgressBar:
+    """Creates a `ProgressBarActor` and renders its progress with tqdm."""
+
+    progress_actor: ActorHandle
+    total: int
+    description: str
+    pbar: tqdm
+
+    def __init__(self, total: int, description: str = ""):
+        # Ray actors don't seem to play nice with mypy, generating
+        # a spurious warning for the following line,
+        # which we need to suppress. The code is fine.
+        self.progress_actor = ProgressBarActor.remote()  # type: ignore
+        self.total = total
+        self.description = description
+
+    @property
+    def actor(self) -> ActorHandle:
+        """Returns a reference to the remote `ProgressBarActor`.
+
+        When you complete tasks, call `update` on the actor.
+        """
+        return self.progress_actor
+
+    def print_until_done(self) -> None:
+        """Blocking call.
+
+        Do this after starting a series of remote Ray tasks, to which you've
+        passed the actor handle. Each of them calls `update` on the actor.
+        When the progress meter reaches 100%, this method returns.
+        """
+        pbar = tqdm(desc=self.description, total=self.total)
+        while True:
+            delta, counter = ray.get(self.actor.wait_for_update.remote())
+            pbar.update(delta)
+            if counter >= self.total:
+                pbar.close()
+                return
+
+
+#################################################################
+# This is an example of a task that increments the progress bar.
+# Note that this is a Ray Task, but it could very well
+# be any generic Ray Actor.
+#
+@ray.remote
+def sleep_then_increment(i: int, pba: ActorHandle) -> int:
+    """Sleep for `i / 2` seconds, report one completed item, return `i`."""
+    sleep(i / 2.0)
+    pba.update.remote(1)
+    return i
+
+
+#################################################################
+# Now you can run it and see what happens!
+#
+
+
+def run():
+    """Launch the example tasks and check the progress bar's final state."""
+    ray.init()
+    num_ticks = 6
+    pb = ProgressBar(num_ticks)
+    actor = pb.actor
+    # You can replace this with any arbitrary Ray task/actor.
+    tasks_pre_launch = [
+        sleep_then_increment.remote(i, actor) for i in range(0, num_ticks)
+    ]
+
+    pb.print_until_done()
+    tasks = ray.get(tasks_pre_launch)
+
+    # Verify every task returned and every update reached the actor.
+    assert tasks == list(range(num_ticks))
+    assert num_ticks == ray.get(actor.get_counter.remote())
+
+
+run()
diff --git a/doc/source/cluster/slurm.rst b/doc/source/cluster/slurm.rst
index 1606489e0..718340c5f 100644
--- a/doc/source/cluster/slurm.rst
+++ b/doc/source/cluster/slurm.rst
@@ -3,7 +3,26 @@
 Deploying on Slurm
 ==================
 
-Clusters managed by Slurm may require that Ray is initialized as a part of the submitted job. This can be done by using ``srun`` within the submitted script. For example:
+Clusters managed by Slurm may require that Ray is initialized as a part of the submitted job. This can be done by using ``srun`` within the submitted script.
+
+Examples and templates
+----------------------
+
+Here are some community-contributed templates for using SLURM with Ray:
+
+- `Ray sbatch submission scripts`_ used at `NERSC `_, a US national lab.
+- `YASPI`_ (yet another slurm python interface) by @albanie.
The goal of yaspi is to provide an interface to submitting slurm jobs, thereby obviating the joys of sbatch files. It does so through recipes - these are collections of templates and rules for generating sbatch scripts. Supports job submissions for Ray. + +- `Template script`_ by @pengzhenghao + +.. _`Ray sbatch submission scripts`: https://github.com/NERSC/slurm-ray-cluster + +.. _`YASPI`: https://github.com/albanie/yaspi + +.. _`Template script`: https://gist.github.com/pengzhenghao/b348db1075101a9b986c4cdfea13dcd6 + +Starter SLURM script +-------------------- .. code-block:: bash