From 7d28bbbdbb058d186db4fa09f10c890cc8884f8b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 24 Aug 2019 20:37:45 -0700 Subject: [PATCH] [rllib] Document on traj postprocess (#5532) * document on traj postprocess * shorten it --- .github/PULL_REQUEST_TEMPLATE.md | 6 +----- doc/source/rllib-training.rst | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d6d04c553..a29de5acd 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,11 +2,7 @@ ## Why are these changes needed? - - -## What do these changes do? - - + ## Related issue number diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 0d0a359fc..889d7bf59 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -259,6 +259,11 @@ You can provide callback functions to be called at points during policy evaluati print("trainer.train() result: {} -> {} episodes".format( info["trainer"].__name__, info["result"]["episodes_this_iter"])) + def on_postprocess_traj(info): + episode = info["episode"] + batch = info["post_batch"] # note: you can mutate this + print("postprocessed {} steps".format(batch.count)) + ray.init() analysis = tune.run( "PG", @@ -269,14 +274,25 @@ You can provide callback functions to be called at points during policy evaluati "on_episode_step": tune.function(on_episode_step), "on_episode_end": tune.function(on_episode_end), "on_train_result": tune.function(on_train_result), + "on_postprocess_traj": tune.function(on_postprocess_traj), }, }, ) +Visualizing Custom Metrics +~~~~~~~~~~~~~~~~~~~~~~~~~~ + Custom metrics can be accessed and visualized like any other training result: .. image:: custom_metric.png +Rewriting Trajectories +~~~~~~~~~~~~~~~~~~~~~~ + +Note that in the ``on_postprocess_traj`` callback you have full access to the trajectory batch (``post_batch``) and other training state. 
This can be used to rewrite the trajectory, which has a number of uses including: + * Backdating rewards to previous time steps (e.g., based on values in ``info``). + * Adding model-based curiosity bonuses to rewards (you can train the model with a `custom model supervised loss <rllib-models.html#supervised-model-losses>`__). + Example: Curriculum Learning ~~~~~~~~~~~~~~~~~~~~~~~~~~~~