mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 11:20:09 +08:00
[rllib] Unify RLLib examples and add jenkins test for policy gradients (#815)
* add jenkins test
* correct handling of the number of iterations
* convert policy gradient and evolution strategies script
* convert DQN
* fix A3C
* fix
* fix
* fixes
* remove redundant A3C example
This commit is contained in:
committed by
Robert Nishihara
parent
dbe3d9351c
commit
862e56000b
@@ -1,36 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray.rllib.a3c import A3C, DEFAULT_CONFIG
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the A3C algorithm.")
|
||||
parser.add_argument("--environment", default="PongDeterministic-v4",
|
||||
type=str, help="The gym environment to use.")
|
||||
parser.add_argument("--redis-address", default=None, type=str,
|
||||
help="The Redis address of the cluster.")
|
||||
parser.add_argument("--num-workers", default=16, type=int,
|
||||
help="The number of A3C workers to use.")
|
||||
parser.add_argument("--iterations", default=-1, type=int,
|
||||
help="The number of training iterations to run.")
|
||||
|
||||
args = parser.parse_args()
|
||||
ray.init(redis_address=args.redis_address, num_cpus=args.num_workers)
|
||||
|
||||
config = DEFAULT_CONFIG.copy()
|
||||
config["num_workers"] = args.num_workers
|
||||
|
||||
a3c = A3C(args.environment, config)
|
||||
|
||||
iteration = 0
|
||||
while iteration != args.iterations:
|
||||
iteration += 1
|
||||
res = a3c.train()
|
||||
print("current status: {}".format(res))
|
||||
@@ -1,48 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.dqn import DQN, DEFAULT_CONFIG
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run the A3C algorithm.")
|
||||
parser.add_argument("--iterations", default=-1, type=int,
|
||||
help="The number of training iterations to run.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
config = DEFAULT_CONFIG.copy()
|
||||
config.update(dict(
|
||||
lr=1e-3,
|
||||
schedule_max_timesteps=100000,
|
||||
exploration_fraction=0.1,
|
||||
exploration_final_eps=0.02,
|
||||
dueling=False,
|
||||
hiddens=[],
|
||||
model_config=dict(
|
||||
fcnet_hiddens=[64],
|
||||
fcnet_activation=tf.nn.relu
|
||||
)))
|
||||
|
||||
# Currently Ray is not used in this example, but we need to call ray.init
|
||||
# to create the directory in which logging will occur. TODO(rkn): Fix this.
|
||||
ray.init()
|
||||
|
||||
dqn = DQN("CartPole-v0", config)
|
||||
|
||||
iteration = 0
|
||||
while iteration != args.iterations:
|
||||
iteration += 1
|
||||
res = dqn.train()
|
||||
print("current status: {}".format(res))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,32 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.dqn import DQN, DEFAULT_CONFIG
|
||||
|
||||
|
||||
def main():
|
||||
config = DEFAULT_CONFIG.copy()
|
||||
config.update(dict(
|
||||
lr=1e-4,
|
||||
schedule_max_timesteps=2000000,
|
||||
buffer_size=10000,
|
||||
exploration_fraction=0.1,
|
||||
exploration_final_eps=0.01,
|
||||
train_freq=4,
|
||||
learning_starts=10000,
|
||||
target_network_update_freq=1000,
|
||||
gamma=0.99,
|
||||
prioritized_replay=True))
|
||||
|
||||
dqn = DQN("PongNoFrameskip-v4", config)
|
||||
|
||||
while True:
|
||||
res = dqn.train()
|
||||
print("current status: {}".format(res))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -106,7 +106,7 @@ class PrioritizedReplayBuffer(ReplayBuffer):
|
||||
def add(self, *args, **kwargs):
|
||||
"""See ReplayBuffer.store_effect"""
|
||||
idx = self._next_idx
|
||||
super().add(*args, **kwargs)
|
||||
super(PrioritizedReplayBuffer, self).add(*args, **kwargs)
|
||||
self._it_sum[idx] = self._max_priority ** self._alpha
|
||||
self._it_min[idx] = self._max_priority ** self._alpha
|
||||
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray.rllib.evolution_strategies import EvolutionStrategies, DEFAULT_CONFIG
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Train an RL agent on Pong.")
|
||||
parser.add_argument("--num-workers", default=10, type=int,
|
||||
help=("The number of actors to create in aggregate "
|
||||
"across the cluster."))
|
||||
parser.add_argument("--env-name", default="Pendulum-v0", type=str,
|
||||
help="The name of the gym environment to use.")
|
||||
parser.add_argument("--stepsize", default=0.01, type=float,
|
||||
help="The stepsize to use.")
|
||||
parser.add_argument("--redis-address", default=None, type=str,
|
||||
help="The Redis address of the cluster.")
|
||||
parser.add_argument("--iterations", default=-1, type=int,
|
||||
help="The number of training iterations to run.")
|
||||
|
||||
args = parser.parse_args()
|
||||
num_workers = args.num_workers
|
||||
env_name = args.env_name
|
||||
stepsize = args.stepsize
|
||||
|
||||
ray.init(redis_address=args.redis_address,
|
||||
num_workers=(0 if args.redis_address is None else None))
|
||||
|
||||
config = DEFAULT_CONFIG.copy()
|
||||
config["num_workers"] = num_workers
|
||||
config["stepsize"] = stepsize
|
||||
|
||||
alg = EvolutionStrategies(env_name, config)
|
||||
iteration = 0
|
||||
while iteration != args.iterations:
|
||||
iteration += 1
|
||||
result = alg.train()
|
||||
print("current status: {}".format(result))
|
||||
@@ -33,7 +33,11 @@ class FullyConnectedNetwork(Model):
|
||||
|
||||
def _init(self, inputs, num_outputs, options):
|
||||
hiddens = options.get("fcnet_hiddens", [256, 256])
|
||||
activation = options.get("fcnet_activation", tf.nn.tanh)
|
||||
fcnet_activation = options.get("fcnet_activation", "tanh")
|
||||
if fcnet_activation == "tanh":
|
||||
activation = tf.nn.tanh
|
||||
elif fcnet_activation == "relu":
|
||||
activation = tf.nn.relu
|
||||
print("Constructing fcnet {} {}".format(hiddens, activation))
|
||||
|
||||
if options.get("free_logstd", False):
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray.rllib.policy_gradient import PolicyGradient, DEFAULT_CONFIG
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the policy gradient "
|
||||
"algorithm.")
|
||||
parser.add_argument("--environment", default="Pong-v0", type=str,
|
||||
help="The gym environment to use.")
|
||||
parser.add_argument("--redis-address", default=None, type=str,
|
||||
help="The Redis address of the cluster.")
|
||||
parser.add_argument("--use-tf-debugger", default=False, type=bool,
|
||||
help="Run the script inside of tf-dbg.")
|
||||
parser.add_argument("--load-checkpoint", default=None, type=str,
|
||||
help="Continue training from a checkpoint.")
|
||||
parser.add_argument("--iterations", default=None, type=int,
|
||||
help="The number of training iterations to run.")
|
||||
|
||||
args = parser.parse_args()
|
||||
config = DEFAULT_CONFIG.copy()
|
||||
config["use_tf_debugger"] = args.use_tf_debugger
|
||||
if args.load_checkpoint is not None:
|
||||
config["load_checkpoint"] = args.load_checkpoint
|
||||
if args.iterations is not None:
|
||||
config["max_iterations"] = args.iterations
|
||||
|
||||
ray.init(redis_address=args.redis_address)
|
||||
|
||||
alg = PolicyGradient(args.environment, config)
|
||||
result = alg.train()
|
||||
while result.training_iteration < config["max_iterations"]:
|
||||
print("\n== iteration", result.training_iteration)
|
||||
result = alg.train()
|
||||
print("current status: {}".format(result))
|
||||
@@ -28,8 +28,6 @@ DEFAULT_CONFIG = {
|
||||
"kl_coeff": 0.2,
|
||||
# Number of SGD iterations in each outer loop
|
||||
"num_sgd_iter": 30,
|
||||
# Number of outer loop iterations
|
||||
"max_iterations": 1000,
|
||||
# Stepsize of SGD
|
||||
"sgd_stepsize": 5e-5,
|
||||
# TODO(pcm): Expose the choice between gpus and cpus
|
||||
|
||||
+22
-10
@@ -7,6 +7,7 @@ from __future__ import print_function
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import ray
|
||||
import ray.rllib.policy_gradient as pg
|
||||
@@ -16,36 +17,45 @@ import ray.rllib.a3c as a3c
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description=("Train a reinforcement learning agent."))
|
||||
parser.add_argument("--env", required=True, type=str)
|
||||
parser.add_argument("--alg", required=True, type=str)
|
||||
parser.add_argument("--config", default="{}", type=str)
|
||||
parser.add_argument("--upload-dir", default="file:///tmp/ray", type=str)
|
||||
parser.add_argument("--redis-address", default=None, type=str,
|
||||
help="The Redis address of the cluster.")
|
||||
parser.add_argument("--env", required=True, type=str,
|
||||
help="The gym environment to use.")
|
||||
parser.add_argument("--alg", required=True, type=str,
|
||||
help="The reinforcement learning algorithm to use.")
|
||||
parser.add_argument("--num-iterations", default=sys.maxsize, type=int,
|
||||
help="The number of training iterations to run.")
|
||||
parser.add_argument("--config", default="{}", type=str,
|
||||
help="The configuration options of the algorithm.")
|
||||
parser.add_argument("--upload-dir", default="file:///tmp/ray", type=str,
|
||||
help="Where the traces are stored.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
json_config = json.loads(args.config)
|
||||
|
||||
ray.init()
|
||||
ray.init(redis_address=args.redis_address)
|
||||
|
||||
env_name = args.env
|
||||
if args.alg == "PolicyGradient":
|
||||
config = pg.DEFAULT_CONFIG.copy()
|
||||
config.update(json.loads(args.config))
|
||||
config.update(json_config)
|
||||
alg = pg.PolicyGradient(
|
||||
env_name, config, upload_dir=args.upload_dir)
|
||||
elif args.alg == "EvolutionStrategies":
|
||||
config = es.DEFAULT_CONFIG.copy()
|
||||
config.update(json.loads(args.config))
|
||||
config.update(json_config)
|
||||
alg = es.EvolutionStrategies(
|
||||
env_name, config, upload_dir=args.upload_dir)
|
||||
elif args.alg == "DQN":
|
||||
config = dqn.DEFAULT_CONFIG.copy()
|
||||
config.update(json.loads(args.config))
|
||||
config.update(json_config)
|
||||
alg = dqn.DQN(
|
||||
env_name, config, upload_dir=args.upload_dir)
|
||||
elif args.alg == "A3C":
|
||||
config = a3c.DEFAULT_CONFIG.copy()
|
||||
config.update(json.loads(args.config))
|
||||
config.update(json_config)
|
||||
alg = a3c.A3C(
|
||||
env_name, config, upload_dir=args.upload_dir)
|
||||
else:
|
||||
@@ -56,7 +66,7 @@ if __name__ == "__main__":
|
||||
result_logger = ray.rllib.common.RLLibLogger(
|
||||
os.path.join(alg.logdir, "result.json"))
|
||||
|
||||
while True:
|
||||
for i in range(args.num_iterations):
|
||||
result = alg.train()
|
||||
|
||||
# We need to use a custom json serializer class so that NaNs get
|
||||
@@ -64,3 +74,5 @@ if __name__ == "__main__":
|
||||
json.dump(result._asdict(), result_logger,
|
||||
cls=ray.rllib.common.RLLibEncoder)
|
||||
result_logger.write("\n")
|
||||
|
||||
print("current status: {}".format(result))
|
||||
|
||||
Reference in New Issue
Block a user