[rllib] Initial work on integrating hyperparameter search tool (#1107)

* clean up train

* update

* update train script

* add tuned examples

* add agent catalog

* add tune lib

* update

* fix

* testS

* remove

* train docs

* comments

* todo

* fix resource parsing

* fix cr test

* add test

* try to fix travis test
This commit is contained in:
Eric Liang
2017-10-13 16:18:16 -07:00
committed by GitHub
parent 486cb64e3f
commit 79ea205b3e
25 changed files with 1075 additions and 207 deletions
+3 -3
View File
@@ -11,7 +11,7 @@ import os
import ray
from ray.rllib.a3c.runner import RunnerThread, process_rollout
from ray.rllib.a3c.envs import create_and_wrap
from ray.rllib.common import Agent, TrainingResult, get_tensorflow_log_dir
from ray.rllib.common import Agent, TrainingResult
from ray.rllib.a3c.shared_model import SharedModel
from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
@@ -73,9 +73,8 @@ class Runner(object):
return completed
def start(self):
logdir = get_tensorflow_log_dir(self.logdir)
summary_writer = tf.summary.FileWriter(
os.path.join(logdir, "agent_%d" % self.id))
os.path.join(self.logdir, "agent_%d" % self.id))
self.summary_writer = summary_writer
self.runner.start_runner(self.policy.sess, summary_writer)
@@ -96,6 +95,7 @@ class Runner(object):
class A3CAgent(Agent):
_agent_name = "A3C"
_default_config = DEFAULT_CONFIG
def _init(self):
self.env = create_and_wrap(self.env_creator, self.config["model"])
+43
View File
@@ -0,0 +1,43 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.common import Agent, TrainingResult
class _MockAgent(Agent):
"""Mock agent for use in tests"""
_agent_name = "MockAgent"
_default_config = {}
def _init(self):
pass
def _train(self):
return TrainingResult(
episode_reward_mean=10, episode_len_mean=10,
timesteps_this_iter=10, info={})
def get_agent_class(alg):
"""Returns the class of an known agent given its name."""
if alg == "PPO":
from ray.rllib import ppo
return ppo.PPOAgent
elif alg == "ES":
from ray.rllib import es
return es.ESAgent
elif alg == "DQN":
from ray.rllib import dqn
return dqn.DQNAgent
elif alg == "A3C":
from ray.rllib import a3c
return a3c.A3CAgent
elif alg == "__fake":
return _MockAgent
else:
raise Exception(
("Unknown algorithm {}, check --alg argument. Valid choices " +
"are PPO, ES, DQN, and A3C.").format(alg))
+135 -81
View File
@@ -1,3 +1,7 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
from datetime import datetime
@@ -11,8 +15,6 @@ import tempfile
import time
import uuid
import gym
import smart_open
import tensorflow as tf
if sys.version_info[0] == 2:
@@ -24,64 +26,6 @@ logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def get_tensorflow_log_dir(logdir):
if logdir.startswith("s3"):
print("WARNING: TensorFlow logging to S3 not supported by"
"TensorFlow, logging to /tmp/ray/ instead")
logdir = "/tmp/ray/"
if not os.path.exists(logdir):
os.makedirs(logdir)
return logdir
class RLLibEncoder(json.JSONEncoder):
def __init__(self, nan_str="null", **kwargs):
super(RLLibEncoder, self).__init__(**kwargs)
self.nan_str = nan_str
def iterencode(self, o, _one_shot=False):
if self.ensure_ascii:
_encoder = json.encoder.encode_basestring_ascii
else:
_encoder = json.encoder.encode_basestring
def floatstr(o, allow_nan=self.allow_nan, nan_str=self.nan_str):
return repr(o) if not np.isnan(o) else nan_str
_iterencode = json.encoder._make_iterencode(
None, self.default, _encoder, self.indent, floatstr,
self.key_separator, self.item_separator, self.sort_keys,
self.skipkeys, _one_shot)
return _iterencode(o, 0)
def default(self, value):
if np.isnan(value):
return None
if np.issubdtype(value, float):
return float(value)
if np.issubdtype(value, int):
return int(value)
class RLLibLogger(object):
"""Writing small amounts of data to S3 with real-time updates.
"""
def __init__(self, uri):
self.result_buffer = StringIO.StringIO()
self.uri = uri
def write(self, b):
# TODO(pcm): At the moment we are writing the whole results output from
# the beginning in each iteration. This will write O(n^2) bytes where n
# is the number of bytes printed so far. Fix this! This should at least
# only write the last 5MBs (S3 chunksize).
with smart_open.smart_open(self.uri, "w") as f:
self.result_buffer.write(b)
f.write(self.result_buffer.getvalue())
TrainingResult = namedtuple("TrainingResult", [
# Unique string identifier for this experiment. This id is preserved
# across checkpoint / restore calls.
@@ -127,49 +71,72 @@ class Agent(object):
logdir (str): Directory in which training outputs should be placed.
"""
def __init__(self, env_creator, config, upload_dir=None):
def __init__(
self, env_creator, config, local_dir='/tmp/ray',
upload_dir=None, agent_id=None):
"""Initialize an RLLib agent.
Args:
env_creator (str|func): Name of the OpenAI gym environment to train
against, or a function that creates such an env.
config (obj): Algorithm-specific configuration data.
upload_dir (str): Root directory into which the output directory
should be placed. Can be local like file:///tmp/ray/ or on S3
like s3://bucketname/.
local_dir (str): Directory where results and temporary files will
be placed.
upload_dir (str): Optional remote URI like s3://bucketname/ where
results will be uploaded.
agent_id (str): Optional unique identifier for this agent, used
to determine where to store results in the local dir.
"""
self._experiment_id = uuid.uuid4().hex
upload_dir = "file:///tmp/ray" if upload_dir is None else upload_dir
if type(env_creator) is str:
import gym
env_name = env_creator
self.env_creator = lambda: gym.make(env_name)
else:
env_name = "custom"
self.env_creator = env_creator
self.config = config
self.config.update({"experiment_id": self._experiment_id})
self.config.update({"env_name": env_name})
self.config.update({"alg": self._agent_name})
prefix = "{}_{}_{}".format(
self.config = self._default_config.copy()
for k in config.keys():
if k not in self.config:
raise Exception(
"Unknown agent config `{}`, "
"all agent configs: {}".format(k, self.config.keys()))
self.config.update(config)
self.config.update({
"agent_id": agent_id,
"alg": self._agent_name,
"env_name": env_name,
"experiment_id": self._experiment_id,
})
logdir_suffix = "{}_{}_{}".format(
env_name,
self.__class__.__name__,
datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
if upload_dir.startswith("file"):
local_dir = upload_dir[len("file://"):]
if not os.path.exists(local_dir):
os.makedirs(local_dir)
self.logdir = tempfile.mkdtemp(prefix=prefix, dir=local_dir)
agent_id or datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
if not os.path.exists(local_dir):
os.makedirs(local_dir)
self.logdir = tempfile.mkdtemp(prefix=logdir_suffix, dir=local_dir)
if upload_dir:
log_upload_uri = os.path.join(upload_dir, logdir_suffix)
else:
self.logdir = os.path.join(upload_dir, prefix)
log_upload_uri = None
# TODO(ekl) consider inlining config into the result jsons
log_path = os.path.join(self.logdir, "config.json")
with smart_open.smart_open(log_path, "w") as f:
config_out = os.path.join(self.logdir, "config.json")
with open(config_out, "w") as f:
json.dump(self.config, f, sort_keys=True, cls=RLLibEncoder)
logger.info(
"%s algorithm created with logdir '%s'",
self.__class__.__name__, self.logdir)
"%s algorithm created with logdir '%s' and upload uri '%s'",
self.__class__.__name__, self.logdir, log_upload_uri)
self._result_logger = RLLibLogger(
os.path.join(self.logdir, "result.json"),
log_upload_uri and os.path.join(log_upload_uri, "result.json"))
self._file_writer = tf.summary.FileWriter(self.logdir)
self._iteration = 0
self._time_total = 0.0
@@ -208,8 +175,29 @@ class Agent(object):
for field in result:
assert field is not None, result
self._log_result(result)
return result
def _log_result(self, result):
"""Appends the given result to this agent's log dir."""
# We need to use a custom json serializer class so that NaNs get
# encoded as null as required by Athena.
json.dump(result._asdict(), self._result_logger, cls=RLLibEncoder)
self._result_logger.write("\n")
train_stats = tf.Summary(value=[
tf.Summary.Value(
tag="rllib/time_this_iter_s",
simple_value=result.time_this_iter_s),
tf.Summary.Value(
tag="rllib/episode_reward_mean",
simple_value=result.episode_reward_mean),
tf.Summary.Value(
tag="rllib/episode_len_mean",
simple_value=result.episode_len_mean)])
self._file_writer.add_summary(train_stats, result.training_iteration)
def save(self):
"""Saves the current model state to a checkpoint.
@@ -237,6 +225,11 @@ class Agent(object):
self._timesteps_total = metadata[2]
self._time_total = metadata[3]
def stop(self):
"""Releases all resources used by this agent."""
self._file_writer.close()
def compute_action(self, observation):
"""Computes an action using the current trained policy."""
@@ -254,6 +247,12 @@ class Agent(object):
raise NotImplementedError
@property
def _default_config(self):
"""Subclasses should override this to declare their default config."""
raise NotImplementedError
def _train(self):
"""Subclasses should override this to implement train()."""
@@ -268,3 +267,58 @@ class Agent(object):
"""Subclasses should override this to implement restore()."""
raise NotImplementedError
class RLLibEncoder(json.JSONEncoder):
def __init__(self, nan_str="null", **kwargs):
super(RLLibEncoder, self).__init__(**kwargs)
self.nan_str = nan_str
def iterencode(self, o, _one_shot=False):
if self.ensure_ascii:
_encoder = json.encoder.encode_basestring_ascii
else:
_encoder = json.encoder.encode_basestring
def floatstr(o, allow_nan=self.allow_nan, nan_str=self.nan_str):
return repr(o) if not np.isnan(o) else nan_str
_iterencode = json.encoder._make_iterencode(
None, self.default, _encoder, self.indent, floatstr,
self.key_separator, self.item_separator, self.sort_keys,
self.skipkeys, _one_shot)
return _iterencode(o, 0)
def default(self, value):
if np.isnan(value):
return None
if np.issubdtype(value, float):
return float(value)
if np.issubdtype(value, int):
return int(value)
class RLLibLogger(object):
"""Writing small amounts of data to S3 with real-time updates.
"""
def __init__(self, local_file, uri=None):
self.local_out = open(local_file, "w")
self.result_buffer = StringIO.StringIO()
self.uri = uri
if self.uri:
import smart_open
self.smart_open = smart_open.smart_open
def write(self, b):
self.local_out.write(b)
self.local_out.flush()
# TODO(pcm): At the moment we are writing the whole results output from
# the beginning in each iteration. This will write O(n^2) bytes where n
# is the number of bytes printed so far. Fix this! This should at least
# only write the last 5MBs (S3 chunksize).
if self.uri:
with self.smart_open(self.uri, "w") as f:
self.result_buffer.write(b)
f.write(self.result_buffer.getvalue())
+1
View File
@@ -245,6 +245,7 @@ class RemoteActor(Actor):
class DQNAgent(Agent):
_agent_name = "DQN"
_default_config = DEFAULT_CONFIG
def _init(self):
self.actor = Actor(self.env_creator, self.config, self.logdir)
+1
View File
@@ -159,6 +159,7 @@ class Worker(object):
class ESAgent(Agent):
_agent_name = "ES"
_default_config = DEFAULT_CONFIG
def _init(self):
+3 -3
View File
@@ -11,7 +11,7 @@ import tensorflow as tf
from tensorflow.python import debug as tf_debug
import ray
from ray.rllib.common import Agent, TrainingResult, get_tensorflow_log_dir
from ray.rllib.common import Agent, TrainingResult
from ray.rllib.ppo.runner import Runner, RemoteRunner
from ray.rllib.ppo.rollout import collect_samples
from ray.rllib.ppo.utils import shuffle
@@ -82,6 +82,7 @@ DEFAULT_CONFIG = {
class PPOAgent(Agent):
_agent_name = "PPO"
_default_config = DEFAULT_CONFIG
def _init(self):
self.global_step = 0
@@ -94,9 +95,8 @@ class PPOAgent(Agent):
for _ in range(self.config["num_workers"])]
self.start_time = time.time()
if self.config["write_logs"]:
logdir = get_tensorflow_log_dir(self.logdir)
self.file_writer = tf.summary.FileWriter(
logdir, self.model.sess.graph)
self.logdir, self.model.sess.graph)
else:
self.file_writer = None
self.saver = tf.train.Saver(max_to_keep=None)
@@ -8,10 +8,7 @@ import numpy as np
import ray
import random
from ray.rllib.dqn import (DQNAgent, DEFAULT_CONFIG as DQN_CONFIG)
from ray.rllib.ppo import (PPOAgent, DEFAULT_CONFIG as PG_CONFIG)
from ray.rllib.a3c import (A3CAgent, DEFAULT_CONFIG as A3C_CONFIG)
# from ray.rllib.es import (ESAgent, DEFAULT_CONFIG as ES_CONFIG)
from ray.rllib.agents import get_agent_class
def get_mean_action(alg, obs):
@@ -22,20 +19,18 @@ def get_mean_action(alg, obs):
ray.init()
for (cls, default_config) in [
(DQNAgent, DQN_CONFIG),
(PPOAgent, PG_CONFIG),
(A3CAgent, A3C_CONFIG),
# https://github.com/ray-project/ray/issues/1062
# (ESAgent, ES_CONFIG),
]:
config = default_config.copy()
config["num_sgd_iter"] = 5
config["use_lstm"] = False # for a3c
config["episodes_per_batch"] = 100
config["timesteps_per_batch"] = 1000
alg1 = cls("CartPole-v0", config)
alg2 = cls("CartPole-v0", config)
CONFIGS = {
"DQN": {},
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000},
"A3C": {"use_lstm": False},
}
# https://github.com/ray-project/ray/issues/1062 for enabling ES test as well
for name in ["DQN", "PPO", "A3C"]:
cls = get_agent_class(name)
alg1 = cls("CartPole-v0", CONFIGS[name])
alg2 = cls("CartPole-v0", CONFIGS[name])
for _ in range(3):
res = alg1.train()
-17
View File
@@ -1,17 +0,0 @@
#!/bin/bash
python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/
python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/
python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}' --alg PPO --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}' --alg PPO --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --config '{"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}' --alg PPO --upload-dir s3://bucketname/
python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/
python train.py --env PongDeterministic-v4 --alg A3C --config '{"num_workers": 16, "num_batches_per_iteration": 1000, "batch_size": 20}' --upload-dir s3://bucketname/
python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/
+47 -77
View File
@@ -1,97 +1,67 @@
#!/usr/bin/env python
"""The main command line interface to RLlib.
Arguments may either be specified on the command line or in JSON/YAML
files. Additionally, the file-based interface supports hyperparameter
exploration through grid or random search, though both interfaces allow
for the concurrent execution of multiple trials on Ray.
Single-trial example:
./train.py --alg=DQN --env=CartPole-v0
Hyperparameter grid search example:
./train.py -f tuned_examples/cartpole-grid-search-example.yaml
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import json
import os
import pprint
import sys
import yaml
import ray
import ray.rllib.ppo as ppo
import ray.rllib.es as es
import ray.rllib.dqn as dqn
import ray.rllib.a3c as a3c
from ray.tune.config_parser import make_parser, parse_to_trials
from ray.tune.trial_runner import TrialRunner
from ray.tune.trial import Trial
parser = argparse.ArgumentParser(
description=("Train a reinforcement learning agent."))
parser = make_parser("Train a reinforcement learning agent.")
# Extends the base parser defined in ray/tune/config_parser, to add some
# RLlib specific arguments. For more arguments, see the configuration
# defined there.
parser.add_argument("--redis-address", default=None, type=str,
help="The Redis address of the cluster.")
parser.add_argument("--env", required=True, type=str,
help="The gym environment to use.")
parser.add_argument("--alg", required=True, type=str,
help="The reinforcement learning algorithm to use.")
parser.add_argument("--num-iterations", default=sys.maxsize, type=int,
help="The number of training iterations to run.")
parser.add_argument("--config", default="{}", type=str,
help="The configuration options of the algorithm.")
parser.add_argument("--upload-dir", default="file:///tmp/ray", type=str,
help="Where the traces are stored.")
parser.add_argument("--checkpoint-freq", default=sys.maxsize, type=int,
help="How many iterations between checkpoints.")
parser.add_argument("--restore", default="", type=str,
help="If specified, restores state from this checkpoint.")
parser.add_argument("--restore", default=None, type=str,
help="If specified, restore from this checkpoint.")
parser.add_argument("-f", "--config-file", default=None, type=str,
help="If specified, use config options from this file.")
if __name__ == "__main__":
args = parser.parse_args()
json_config = json.loads(args.config)
runner = TrialRunner()
if args.config_file:
with open(args.config_file) as f:
config = yaml.load(f)
for trial in parse_to_trials(config):
runner.add_trial(trial)
else:
runner.add_trial(
Trial(
args.env, args.alg, args.config, args.local_dir, None,
args.resources, args.stop, args.checkpoint_freq,
args.restore, args.upload_dir))
ray.init(redis_address=args.redis_address)
def _check_and_update(config, json):
for k in json.keys():
if k not in config:
raise Exception(
"Unknown model config `{}`, all model configs: {}".format(
k, config.keys()))
config.update(json)
while not runner.is_finished():
runner.step()
print(runner.debug_string())
env_name = args.env
if args.alg == "PPO":
config = ppo.DEFAULT_CONFIG.copy()
_check_and_update(config, json_config)
alg = ppo.PPOAgent(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "ES":
config = es.DEFAULT_CONFIG.copy()
_check_and_update(config, json_config)
alg = es.ESAgent(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "DQN":
config = dqn.DEFAULT_CONFIG.copy()
_check_and_update(config, json_config)
alg = dqn.DQNAgent(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "A3C":
config = a3c.DEFAULT_CONFIG.copy()
_check_and_update(config, json_config)
alg = a3c.A3CAgent(
env_name, config, upload_dir=args.upload_dir)
else:
assert False, ("Unknown algorithm, check --alg argument. Valid "
"choices are PPO, ES, DQN and A3C.")
result_logger = ray.rllib.common.RLLibLogger(
os.path.join(alg.logdir, "result.json"))
if args.restore:
alg.restore(args.restore)
for i in range(args.num_iterations):
result = alg.train()
# We need to use a custom json serializer class so that NaNs get
# encoded as null as required by Athena.
json.dump(result._asdict(), result_logger,
cls=ray.rllib.common.RLLibEncoder)
result_logger.write("\n")
print("== Iteration {} ==".format(alg.iteration))
pprint.pprint(result._asdict())
if (i + 1) % args.checkpoint_freq == 0:
print("checkpoint path: {}".format(alg.save()))
for trial in runner.get_trials():
if trial.status != Trial.TERMINATED:
sys.exit(1)
@@ -0,0 +1,15 @@
cartpole-ppo:
env: CartPole-v0
alg: PPO
num_trials: 6
stop:
episode_reward_mean: 200
time_total_s: 180
resources:
cpu: 2
config:
num_workers: 2
num_sgd_iter:
grid_search: [1, 4]
sgd_batchsize:
grid_search: [128, 256, 512]
@@ -0,0 +1,8 @@
hopper-ppo:
env: Hopper-v1
alg: PPO
num_trials: 1
resources:
cpu: 64
gpu: 4
config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
@@ -0,0 +1,9 @@
humanoid-es:
env: Humanoid-v1
alg: ES
resources:
cpu: 100
stop:
episode_reward_mean: 6000
config:
num_workers: 100
@@ -0,0 +1,11 @@
humanoid-ppo-gae:
env: Humanoid-v1
alg: PPO
num_trials: 1
stop:
episode_reward_mean: 6000
resources:
cpu: 64
gpu: 4
config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}
@@ -0,0 +1,10 @@
humanoid-ppo:
env: Humanoid-v1
alg: PPO
num_trials: 1
stop:
episode_reward_mean: 6000
resources:
cpu: 64
gpu: 4
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
@@ -0,0 +1,9 @@
pong-a3c:
env: PongDeterministic-v4
alg: A3C
resources:
cpu: 16
config:
num_workers: 16
num_batches_per_iteration: 1000
batch_size: 20
@@ -0,0 +1,9 @@
pong-dqn:
env: PongDeterministic-v4
alg: DQN
resources:
cpu: 1
gpu: 1
stop:
episode_reward_mean: 20
time_total_s: 7200
@@ -0,0 +1,8 @@
walker2d-v1-ppo:
env: Walker2d-v1
alg: PPO
num_trials: 1
resources:
cpu: 64
gpu: 4
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}
+54
View File
@@ -0,0 +1,54 @@
Ray.tune: Fast hyperparameter search
====================================
Using ray.tune with RLlib
-------------------------
One way to use ray.tune is through RLlib's train.py script. The train.py script
supports two modes. For example, to run multiple concurrent trials of Pong:
- Inline args: ``./train.py --env=Pong-v0 --alg=PPO --num_trials=8 --stop '{"time_total_s": 3200}' --resources '{"cpu": 8, "gpu": 2}' --config '{"num_workers": 8, "sgd_num_iter": 10}'``
- File-based: ``./train.py -f tune-pong.yaml``
Both delegate scheduling of trials to the ray.tune TrialRunner class.
Additionally, the file-based mode supports hyper-parameter tuning
(currently just grid and random search).
To specify search parameters, variables in the `config` section may be set to
different values for each trial. You can either specify `grid_search: <list>`
in place of a concrete value to specify a grid search across the list of
values, or `eval: <str>` for values to be sampled from the given Python
expression.
.. code:: yaml
cartpole-ppo:
env: CartPole-v0
alg: PPO
num_trials: 6
stop:
episode_reward_mean: 200
time_total_s: 180
resources:
cpu: 4
config:
num_workers: 4
num_sgd_iter:
grid_search: [1, 4]
sgd_batchsize:
grid_search: [128, 256, 512]
lr:
eval: random.uniform(1e-4, 1e-3)
See ray/rllib/tuned_examples for more examples of configs in YAML form.
Using ray.tune to run custom scripts
------------------------------------
TODO
Using ray.tune as a library
---------------------------
TODO
View File
+138
View File
@@ -0,0 +1,138 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import json
import numpy as np
import os
import random
import sys
from ray.tune.trial import Trial, Resources
def _resource_json(data):
values = json.loads(data)
return Resources(values.get('cpu', 0), values.get('gpu', 0))
def make_parser(description):
"""Returns a base argument parser for the ray.tune tool."""
parser = argparse.ArgumentParser(description=(description))
parser.add_argument("--alg", default="PPO", type=str,
help="The learning algorithm to train.")
parser.add_argument("--stop", default="{}", type=json.loads,
help="The stopping criteria, specified in JSON.")
parser.add_argument("--config", default="{}", type=json.loads,
help="The config of the algorithm, specified in JSON.")
parser.add_argument("--resources", default='{"cpu": 1}',
type=_resource_json,
help="Amount of resources to allocate per trial.")
parser.add_argument("--num_trials", default=1, type=int,
help="Number of trials to evaluate.")
parser.add_argument("--local_dir", default="/tmp/ray", type=str,
help="Local dir to save training results to.")
parser.add_argument("--upload_dir", default=None, type=str,
help="URI to upload training results to.")
parser.add_argument("--checkpoint_freq", default=sys.maxsize, type=int,
help="How many iterations between checkpoints.")
# TODO(ekl) environments are RL specific
parser.add_argument("--env", default=None, type=str,
help="The gym environment to use.")
return parser
def parse_to_trials(config):
"""Parses a json config to the number of trials specified by the config.
The input config is a mapping from experiment names to an argument
dictionary describing a set of trials. These args include the parser args
documented in make_parser().
"""
def resolve(agent_cfg, resolved_vars, i):
assert type(agent_cfg) == dict
cfg = agent_cfg.copy()
for p, val in cfg.items():
if type(val) == dict and "eval" in val:
cfg[p] = eval(val["eval"], {
"random": random,
"np": np,
}, {
"_i": i,
})
resolved_vars[p] = True
return cfg, resolved_vars
def to_argv(config):
argv = []
for k, v in config.items():
argv.append("--{}".format(k))
if type(v) is str:
argv.append(v)
else:
argv.append(json.dumps(v))
return argv
def param_str(config, resolved_vars):
return "_".join(
[k + "=" + str(v) for k, v in sorted(config.items())
if resolved_vars.get(k)])
parser = make_parser("Ray hyperparameter tuning tool")
trials = []
for experiment_name, exp_cfg in config.items():
args = parser.parse_args(to_argv(exp_cfg))
grid_search = _GridSearchGenerator(args.config)
for i in range(args.num_trials):
next_cfg, resolved_vars = grid_search.next()
resolved, resolved_vars = resolve(next_cfg, resolved_vars, i)
if resolved_vars:
agent_id = "{}_{}".format(
i, param_str(resolved, resolved_vars))
else:
agent_id = str(i)
trials.append(Trial(
args.env, args.alg, resolved,
os.path.join(args.local_dir, experiment_name), agent_id,
args.resources, args.stop, args.checkpoint_freq, None,
args.upload_dir))
return trials
class _GridSearchGenerator(object):
"""Generator that implements grid search over a set of value lists."""
def __init__(self, agent_cfg):
self.cfg = agent_cfg
self.grid_values = []
for p, val in sorted(agent_cfg.items()):
if type(val) == dict and "grid_search" in val:
assert type(val["grid_search"] == list)
self.grid_values.append((p, val["grid_search"]))
self.value_indices = [0] * len(self.grid_values)
def next(self):
cfg = self.cfg.copy()
resolved_vars = {}
for i, (k, values) in enumerate(self.grid_values):
idx = self.value_indices[i]
cfg[k] = values[idx]
resolved_vars[k] = True
if self.grid_values:
self._increment(0)
return cfg, resolved_vars
def _increment(self, i):
self.value_indices[i] += 1
if self.value_indices[i] >= len(self.grid_values[i][1]):
self.value_indices[i] = 0
if i + 1 < len(self.value_indices):
self._increment(i + 1)
+167
View File
@@ -0,0 +1,167 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import traceback
import ray
from collections import namedtuple
from ray.rllib.agents import get_agent_class
# Ray resources required to schedule a Trial
Resources = namedtuple("Resources", ["cpu", "gpu"])
class Trial(object):
"""A trial object holds the state for one model training run.
Trials are themselves managed by the TrialRunner class, which implements
the event loop for submitting trial runs to a Ray cluster.
Trials start in the PENDING state, and transition to RUNNING once started.
On error it transitions to ERROR, otherwise TERMINATED on success.
"""
PENDING = 'PENDING'
RUNNING = 'RUNNING'
TERMINATED = 'TERMINATED'
ERROR = 'ERROR'
def __init__(
self, env_creator, alg, config={}, local_dir='/tmp/ray',
agent_id=None, resources=Resources(cpu=1, gpu=0),
stopping_criterion={}, checkpoint_freq=sys.maxsize,
restore_path=None, upload_dir=None):
"""Initialize a new trial.
The args here take the same meaning as the command line flags defined
in ray.tune.config_parser.
"""
# Immutable config
self.env_creator = env_creator
if type(env_creator) is str:
self.env_name = env_creator
else:
self.env_name = "custom"
self.alg = alg
self.config = config
self.local_dir = local_dir
self.agent_id = agent_id
self.resources = resources
self.stopping_criterion = stopping_criterion
self.checkpoint_freq = checkpoint_freq
self.restore_path = restore_path
self.upload_dir = upload_dir
# Local trial state that is updated during the run
self.last_result = None
self.checkpoint_path = None
self.agent = None
self.status = Trial.PENDING
def start(self):
"""Starts this trial.
If an error is encountered when starting the trial, an exception will
be thrown.
"""
self.status = Trial.RUNNING
agent_cls = get_agent_class(self.alg)
cls = ray.remote(
num_cpus=self.resources.cpu, num_gpus=self.resources.gpu)(
agent_cls)
self.agent = cls.remote(
self.env_creator, self.config, self.local_dir, self.upload_dir,
agent_id=self.agent_id)
if self.restore_path:
ray.get(self.agent.restore.remote(self.restore_path))
def stop(self, error=False):
"""Stops this trial.
Stops this trial, releasing all allocating resources. If stopping the
trial fails, the run will be marked as terminated in error, but no
exception will be thrown.
Args:
error (bool): Whether to mark this trial as terminated in error.
"""
if error:
self.status = Trial.ERROR
else:
self.status = Trial.TERMINATED
try:
if self.agent:
self.agent.stop.remote()
self.agent.__ray_terminate__.remote(
self.agent._ray_actor_id.id())
except:
print("Error stopping agent:", traceback.format_exc())
self.status = Trial.ERROR
finally:
self.agent = None
def train_remote(self):
"""Returns Ray future for one iteration of training."""
assert self.status == Trial.RUNNING, self.status
return self.agent.train.remote()
def should_stop(self, result):
"""Whether the given result meets this trial's stopping criteria."""
for criteria, stop_value in self.stopping_criterion.items():
if getattr(result, criteria) >= stop_value:
return True
return False
def should_checkpoint(self):
"""Whether this trial is due for checkpointing."""
if self.checkpoint_freq is None:
return False
return self.last_result.training_iteration % self.checkpoint_freq == 0
def progress_string(self):
"""Returns a progress message for printing out to the console."""
if self.last_result is None:
return self.status
return '{}, {} s, {} ts, {} itrs, {} rew'.format(
self.status,
int(self.last_result.time_total_s),
int(self.last_result.timesteps_total),
self.last_result.training_iteration,
round(self.last_result.episode_reward_mean, 1))
def checkpoint(self):
"""Synchronously checkpoints the state of this trial.
TODO(ekl): we should support a PAUSED state based on checkpointing.
"""
path = ray.get(self.agent.save.remote())
self.checkpoint_path = path
print("Saved checkpoint to:", path)
return path
def __str__(self):
identifier = '{}_{}'.format(self.alg, self.env_name)
if self.agent_id:
identifier += '_' + self.agent_id
return identifier
def __eq__(self, other):
return str(self) == str(other)
def __hash__(self):
return hash(str(self))
+182
View File
@@ -0,0 +1,182 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ray
import time
import traceback
from ray.tune.trial import Trial, Resources
class TrialRunner(object):
"""A TrialRunner implements the event loop for scheduling trials on Ray.
Example:
runner = TrialRunner()
runner.add_trial(Trial(...))
runner.add_trial(Trial(...))
while not runner.is_finished():
runner.step()
print(runner.debug_string())
The main job of TrialRunner is scheduling trials to efficiently use cluster
resources, without overloading the cluster.
While Ray itself provides resource management for tasks and actors, this is
not sufficient when scheduling trials that may instantiate multiple actors.
This is because if insufficient resources are available, concurrent agents
could deadlock waiting for new resources to become available. Furthermore,
oversubscribing the cluster could degrade training performance, leading to
misleading benchmark results.
"""
def __init__(self):
"""Initializes a new TrialRunner."""
self._trials = []
self._pending = {}
self._avail_resources = Resources(cpu=0, gpu=0)
self._committed_resources = Resources(cpu=0, gpu=0)
def is_finished(self):
"""Returns whether all trials have finished running."""
for t in self._trials:
if t.status in [Trial.PENDING, Trial.RUNNING]:
return False
return True
def step(self):
"""Runs one step of the trial event loop.
Callers should typically run this method repeatedly in a loop. They
may inspect or modify the runner's state in between calls to step().
"""
if self._can_launch_more():
self._launch_trial()
elif self._pending:
self._process_events()
else:
for trial in self._trials:
if trial.status == Trial.PENDING:
assert self._has_resources(trial.resources), \
("Insufficient cluster resources to launch trial",
trial.resources)
assert False, "Called step when all trials finished?"
def get_trials(self):
"""Returns the list of trials managed by this TrialRunner.
Note that the caller usually should not mutate trial state directly.
"""
return self._trials
def add_trial(self, trial):
"""Adds a new trial to this TrialRunner.
Trials may be added at any time.
"""
self._trials.append(trial)
def debug_string(self):
"""Returns a human readable message for printing to the console."""
messages = ["== Status =="]
messages.append(
"Available: {}".format(self._avail_resources))
messages.append(
"Committed: {}".format(self._committed_resources))
for local_dir in sorted(set([t.local_dir for t in self._trials])):
messages.append("Tensorboard logdir: {}".format(local_dir))
for t in self._trials:
if t.local_dir == local_dir:
messages.append(
" - {}:\t{}".format(t, t.progress_string()))
return "\n".join(messages) + "\n"
def _can_launch_more(self):
self._update_avail_resources()
trial = self._get_runnable()
return trial is not None
def _launch_trial(self):
trial = self._get_runnable()
self._commit_resources(trial.resources)
try:
trial.start()
self._pending[trial.train_remote()] = trial
except:
print("Error starting agent, retrying:", traceback.format_exc())
time.sleep(2)
trial.stop(error=True)
try:
trial.start()
self._pending[trial.train_remote()] = trial
except:
print("Error starting agent, abort:", traceback.format_exc())
trial.stop(error=True)
# note that we don't return the resources, since they may
# have been lost
def _process_events(self):
[result_id], _ = ray.wait(self._pending.keys())
trial = self._pending[result_id]
del self._pending[result_id]
try:
result = ray.get(result_id)
print("result", result)
trial.last_result = result
if trial.should_stop(result):
self._return_resources(trial.resources)
trial.stop()
else:
# TODO(rliaw): This implements checkpoint in a blocking manner
if trial.should_checkpoint():
trial.checkpoint()
self._pending[trial.train_remote()] = trial
except:
print("Error processing event:", traceback.format_exc())
if trial.status == Trial.RUNNING:
self._return_resources(trial.resources)
trial.stop(error=True)
def _get_runnable(self):
for trial in self._trials:
if (trial.status == Trial.PENDING and
self._has_resources(trial.resources)):
return trial
return None
def _has_resources(self, resources):
cpu_avail = self._avail_resources.cpu - self._committed_resources.cpu
gpu_avail = self._avail_resources.gpu - self._committed_resources.gpu
assert cpu_avail >= 0 and gpu_avail >= 0
return resources.cpu <= cpu_avail and resources.gpu <= gpu_avail
def _commit_resources(self, resources):
self._committed_resources = Resources(
self._committed_resources.cpu + resources.cpu,
self._committed_resources.gpu + resources.gpu)
def _return_resources(self, resources):
self._committed_resources = Resources(
self._committed_resources.cpu - resources.cpu,
self._committed_resources.gpu - resources.gpu)
assert self._committed_resources.cpu >= 0
assert self._committed_resources.gpu >= 0
def _update_avail_resources(self):
clients = ray.global_state.client_table()
local_schedulers = [
entry for client in clients.values() for entry in client
if (entry['ClientType'] == 'local_scheduler' and not
entry['Deleted'])
]
num_cpus = sum(ls['NumCPUs'] for ls in local_schedulers)
num_gpus = sum(ls['NumGPUs'] for ls in local_schedulers)
self._avail_resources = Resources(int(num_cpus), int(num_gpus))