[rllib] Unify RLLib examples and add jenkins test for policy gradients (#815)

* add jenkins test

* correct handling of the number of iterations

* convert policy gradient and evolution strategies script

* convert DQN

* fix A3C

* fix

* fix

* fixes

* remove redundant A3C example
This commit is contained in:
Philipp Moritz
2017-08-07 19:05:48 -07:00
committed by Robert Nishihara
parent dbe3d9351c
commit 862e56000b
10 changed files with 57 additions and 228 deletions
-36
View File
@@ -1,36 +0,0 @@
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ray
from ray.rllib.a3c import A3C, DEFAULT_CONFIG
if __name__ == "__main__":
    # Example driver: parse the command line, start Ray, build an A3C agent
    # and train it, printing the result of every training iteration.

    # Command-line options as (flag, default, type, help text).
    cli_options = (
        ("--environment", "PongDeterministic-v4", str,
         "The gym environment to use."),
        ("--redis-address", None, str,
         "The Redis address of the cluster."),
        ("--num-workers", 16, int,
         "The number of A3C workers to use."),
        ("--iterations", -1, int,
         "The number of training iterations to run."),
    )
    arg_parser = argparse.ArgumentParser(description="Run the A3C algorithm.")
    for flag, default, kind, help_text in cli_options:
        arg_parser.add_argument(flag, default=default, type=kind,
                                help=help_text)
    options = arg_parser.parse_args()

    # The worker count is used both as Ray's CPU count and as the A3C
    # "num_workers" setting.
    worker_count = options.num_workers
    ray.init(redis_address=options.redis_address, num_cpus=worker_count)

    settings = DEFAULT_CONFIG.copy()
    settings["num_workers"] = worker_count
    agent = A3C(options.environment, settings)

    # The counter starts at 0 and only grows, so with the default of -1 it
    # never equals the target and training runs until interrupted.
    completed = 0
    while completed != options.iterations:
        completed += 1
        status = agent.train()
        print("current status: {}".format(status))
-48
View File
@@ -1,48 +0,0 @@
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import ray
from ray.rllib.dqn import DQN, DEFAULT_CONFIG
def main():
    """Train a DQN agent on CartPole-v0 and print each iteration's result.

    Reads ``--iterations`` from the command line. The loop counter starts at
    0 and is compared for inequality, so with the default of -1 it never
    matches and training continues until the process is interrupted.
    """
    # The description previously said "A3C", which was a copy-paste error:
    # this script runs DQN.
    parser = argparse.ArgumentParser(description="Run the DQN algorithm.")
    parser.add_argument("--iterations", default=-1, type=int,
                        help="The number of training iterations to run.")
    args = parser.parse_args()

    # Start from the library defaults and override the settings used for
    # this example.
    config = DEFAULT_CONFIG.copy()
    config.update(dict(
        lr=1e-3,
        schedule_max_timesteps=100000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        dueling=False,
        hiddens=[],
        model_config=dict(
            fcnet_hiddens=[64],
            fcnet_activation=tf.nn.relu
        )))

    # Currently Ray is not used in this example, but we need to call ray.init
    # to create the directory in which logging will occur. TODO(rkn): Fix this.
    ray.init()

    dqn = DQN("CartPole-v0", config)

    iteration = 0
    while iteration != args.iterations:
        iteration += 1
        res = dqn.train()
        print("current status: {}".format(res))
if __name__ == "__main__":
main()
-32
View File
@@ -1,32 +0,0 @@
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.dqn import DQN, DEFAULT_CONFIG
def main():
    """Train a DQN agent on PongNoFrameskip-v4, printing every result.

    The training loop has no exit condition; it runs until the process is
    interrupted.
    """
    # Settings that replace the library defaults for this example.
    overrides = {
        "lr": 1e-4,
        "schedule_max_timesteps": 2000000,
        "buffer_size": 10000,
        "exploration_fraction": 0.1,
        "exploration_final_eps": 0.01,
        "train_freq": 4,
        "learning_starts": 10000,
        "target_network_update_freq": 1000,
        "gamma": 0.99,
        "prioritized_replay": True,
    }
    settings = DEFAULT_CONFIG.copy()
    settings.update(overrides)

    agent = DQN("PongNoFrameskip-v4", settings)

    while True:
        status = agent.train()
        print("current status: {}".format(status))
if __name__ == '__main__':
main()
+1 -1
View File
@@ -106,7 +106,7 @@ class PrioritizedReplayBuffer(ReplayBuffer):
def add(self, *args, **kwargs):
"""See ReplayBuffer.store_effect"""
idx = self._next_idx
super().add(*args, **kwargs)
super(PrioritizedReplayBuffer, self).add(*args, **kwargs)
self._it_sum[idx] = self._max_priority ** self._alpha
self._it_min[idx] = self._max_priority ** self._alpha
@@ -1,44 +0,0 @@
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ray
from ray.rllib.evolution_strategies import EvolutionStrategies, DEFAULT_CONFIG
if __name__ == "__main__":
    # Example driver: parse the command line, start Ray, build an
    # EvolutionStrategies agent and train it, printing every result.

    # The description previously claimed the agent is trained "on Pong",
    # which contradicts the configurable --env-name (default Pendulum-v0).
    parser = argparse.ArgumentParser(
        description="Run the evolution strategies algorithm.")
    parser.add_argument("--num-workers", default=10, type=int,
                        help=("The number of actors to create in aggregate "
                              "across the cluster."))
    parser.add_argument("--env-name", default="Pendulum-v0", type=str,
                        help="The name of the gym environment to use.")
    parser.add_argument("--stepsize", default=0.01, type=float,
                        help="The stepsize to use.")
    parser.add_argument("--redis-address", default=None, type=str,
                        help="The Redis address of the cluster.")
    parser.add_argument("--iterations", default=-1, type=int,
                        help="The number of training iterations to run.")
    args = parser.parse_args()

    # Without a Redis address, ray.init is asked for 0 workers; with one,
    # num_workers is passed as None.
    # NOTE(review): presumably None defers to the existing cluster's
    # workers -- confirm against ray.init.
    ray.init(redis_address=args.redis_address,
             num_workers=(0 if args.redis_address is None else None))

    config = DEFAULT_CONFIG.copy()
    config["num_workers"] = args.num_workers
    config["stepsize"] = args.stepsize

    alg = EvolutionStrategies(args.env_name, config)

    # The counter starts at 0 and only grows, so with the default of -1 it
    # never equals the target and training runs until interrupted.
    iteration = 0
    while iteration != args.iterations:
        iteration += 1
        result = alg.train()
        print("current status: {}".format(result))
+5 -1
View File
@@ -33,7 +33,11 @@ class FullyConnectedNetwork(Model):
def _init(self, inputs, num_outputs, options):
hiddens = options.get("fcnet_hiddens", [256, 256])
activation = options.get("fcnet_activation", tf.nn.tanh)
fcnet_activation = options.get("fcnet_activation", "tanh")
if fcnet_activation == "tanh":
activation = tf.nn.tanh
elif fcnet_activation == "relu":
activation = tf.nn.relu
print("Constructing fcnet {} {}".format(hiddens, activation))
if options.get("free_logstd", False):
@@ -1,42 +0,0 @@
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ray
from ray.rllib.policy_gradient import PolicyGradient, DEFAULT_CONFIG
def parse_bool(text):
    """Convert a command-line string such as "true" or "0" into a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value -- including "False" -- becomes True. This converter
    interprets the text instead.

    Args:
        text: The raw argument string (matched case-insensitively, ignoring
            surrounding whitespace).

    Returns:
        True for "1", "true", "t", "yes", "y", "on"; False for "0", "false",
        "f", "no", "n", "off".

    Raises:
        argparse.ArgumentTypeError: If the text is not one of the values
            above.
    """
    normalized = text.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    if normalized in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(text))


if __name__ == "__main__":
    # Example driver: parse the command line, start Ray, build a
    # PolicyGradient agent and train it until "max_iterations" is reached.
    parser = argparse.ArgumentParser(description="Run the policy gradient "
                                                 "algorithm.")
    parser.add_argument("--environment", default="Pong-v0", type=str,
                        help="The gym environment to use.")
    parser.add_argument("--redis-address", default=None, type=str,
                        help="The Redis address of the cluster.")
    # nargs="?" with const=True lets the flag be given bare
    # ("--use-tf-debugger") or with an explicit value
    # ("--use-tf-debugger False"), which is now honored.
    parser.add_argument("--use-tf-debugger", default=False, type=parse_bool,
                        nargs="?", const=True,
                        help="Run the script inside of tf-dbg.")
    parser.add_argument("--load-checkpoint", default=None, type=str,
                        help="Continue training from a checkpoint.")
    parser.add_argument("--iterations", default=None, type=int,
                        help="The number of training iterations to run.")
    args = parser.parse_args()

    config = DEFAULT_CONFIG.copy()
    config["use_tf_debugger"] = args.use_tf_debugger
    # Optional settings only override the defaults when given explicitly.
    if args.load_checkpoint is not None:
        config["load_checkpoint"] = args.load_checkpoint
    if args.iterations is not None:
        config["max_iterations"] = args.iterations

    ray.init(redis_address=args.redis_address)

    alg = PolicyGradient(args.environment, config)

    # Train once up front, then keep training while the reported iteration
    # count is below the configured maximum.
    result = alg.train()
    while result.training_iteration < config["max_iterations"]:
        print("\n== iteration", result.training_iteration)

        result = alg.train()
        print("current status: {}".format(result))
@@ -28,8 +28,6 @@ DEFAULT_CONFIG = {
"kl_coeff": 0.2,
# Number of SGD iterations in each outer loop
"num_sgd_iter": 30,
# Number of outer loop iterations
"max_iterations": 1000,
# Stepsize of SGD
"sgd_stepsize": 5e-5,
# TODO(pcm): Expose the choice between gpus and cpus
+22 -10
View File
@@ -7,6 +7,7 @@ from __future__ import print_function
import argparse
import json
import os
import sys
import ray
import ray.rllib.policy_gradient as pg
@@ -16,36 +17,45 @@ import ray.rllib.a3c as a3c
parser = argparse.ArgumentParser(
description=("Train a reinforcement learning agent."))
parser.add_argument("--env", required=True, type=str)
parser.add_argument("--alg", required=True, type=str)
parser.add_argument("--config", default="{}", type=str)
parser.add_argument("--upload-dir", default="file:///tmp/ray", type=str)
parser.add_argument("--redis-address", default=None, type=str,
help="The Redis address of the cluster.")
parser.add_argument("--env", required=True, type=str,
help="The gym environment to use.")
parser.add_argument("--alg", required=True, type=str,
help="The reinforcement learning algorithm to use.")
parser.add_argument("--num-iterations", default=sys.maxsize, type=int,
help="The number of training iterations to run.")
parser.add_argument("--config", default="{}", type=str,
help="The configuration options of the algorithm.")
parser.add_argument("--upload-dir", default="file:///tmp/ray", type=str,
help="Where the traces are stored.")
if __name__ == "__main__":
args = parser.parse_args()
json_config = json.loads(args.config)
ray.init()
ray.init(redis_address=args.redis_address)
env_name = args.env
if args.alg == "PolicyGradient":
config = pg.DEFAULT_CONFIG.copy()
config.update(json.loads(args.config))
config.update(json_config)
alg = pg.PolicyGradient(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "EvolutionStrategies":
config = es.DEFAULT_CONFIG.copy()
config.update(json.loads(args.config))
config.update(json_config)
alg = es.EvolutionStrategies(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "DQN":
config = dqn.DEFAULT_CONFIG.copy()
config.update(json.loads(args.config))
config.update(json_config)
alg = dqn.DQN(
env_name, config, upload_dir=args.upload_dir)
elif args.alg == "A3C":
config = a3c.DEFAULT_CONFIG.copy()
config.update(json.loads(args.config))
config.update(json_config)
alg = a3c.A3C(
env_name, config, upload_dir=args.upload_dir)
else:
@@ -56,7 +66,7 @@ if __name__ == "__main__":
result_logger = ray.rllib.common.RLLibLogger(
os.path.join(alg.logdir, "result.json"))
while True:
for i in range(args.num_iterations):
result = alg.train()
# We need to use a custom json serializer class so that NaNs get
@@ -64,3 +74,5 @@ if __name__ == "__main__":
json.dump(result._asdict(), result_logger,
cls=ray.rllib.common.RLLibEncoder)
result_logger.write("\n")
print("current status: {}".format(result))