#!/usr/bin/env python """Example of using PBT with RLlib. Note that this requires a cluster with at least 8 GPUs in order for all trials to run concurrently, otherwise PBT will round-robin train the trials which is less efficient (or you can set {"gpu": 0} to use CPUs for SGD instead). """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import random import ray from ray.tune import run_experiments from ray.tune.pbt import PopulationBasedTraining if __name__ == "__main__": # Postprocess the perturbed config to ensure it's still valid def explore(config): # ensure we collect enough timesteps to do sgd if config["timesteps_per_batch"] < config["sgd_batchsize"] * 2: config["timesteps_per_batch"] = config["sgd_batchsize"] * 2 # ensure we run at least one sgd iter if config["num_sgd_iter"] < 1: config["num_sgd_iter"] = 1 return config pbt = PopulationBasedTraining( time_attr="time_total_s", reward_attr="episode_reward_mean", perturbation_interval=120, resample_probability=0.25, # Specifies the mutations of these hyperparams hyperparam_mutations={ "lambda": lambda: random.uniform(0.9, 1.0), "clip_param": lambda: random.uniform(0.01, 0.5), "sgd_stepsize": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5], "num_sgd_iter": lambda: random.randint(1, 30), "sgd_batchsize": lambda: random.randint(128, 16384), "timesteps_per_batch": lambda: random.randint(2000, 160000), }, custom_explore_fn=explore) ray.init() run_experiments( { "pbt_humanoid_test": { "run": "PPO", "env": "Humanoid-v1", "repeat": 8, "config": { "kl_coeff": 1.0, "num_workers": 8, "devices": ["/gpu:0"], "model": { "free_log_std": True }, # These params are tuned from a fixed starting value. "lambda": 0.95, "clip_param": 0.2, "sgd_stepsize": 1e-4, # These params start off randomly drawn from a set. "num_sgd_iter": lambda spec: random.choice([10, 20, 30]), "sgd_batchsize": lambda spec: random.choice([128, 512, 2048]), "timesteps_per_batch": lambda spec: random.choice([10000, 20000, 40000]) }, }, }, scheduler=pbt)