diff --git a/python/ray/rllib/a3c/example.py b/python/ray/rllib/a3c/example.py
deleted file mode 100755
index 8b794ecf4..000000000
--- a/python/ray/rllib/a3c/example.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-
-import ray
-from ray.rllib.a3c import A3C, DEFAULT_CONFIG
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the A3C algorithm.")
-    parser.add_argument("--environment", default="PongDeterministic-v4",
-                        type=str, help="The gym environment to use.")
-    parser.add_argument("--redis-address", default=None, type=str,
-                        help="The Redis address of the cluster.")
-    parser.add_argument("--num-workers", default=16, type=int,
-                        help="The number of A3C workers to use.")
-    parser.add_argument("--iterations", default=-1, type=int,
-                        help="The number of training iterations to run.")
-
-    args = parser.parse_args()
-    ray.init(redis_address=args.redis_address, num_cpus=args.num_workers)
-
-    config = DEFAULT_CONFIG.copy()
-    config["num_workers"] = args.num_workers
-
-    a3c = A3C(args.environment, config)
-
-    iteration = 0
-    while iteration != args.iterations:
-        iteration += 1
-        res = a3c.train()
-        print("current status: {}".format(res))
diff --git a/python/ray/rllib/dqn/example-cartpole.py b/python/ray/rllib/dqn/example-cartpole.py
deleted file mode 100755
index 780a25569..000000000
--- a/python/ray/rllib/dqn/example-cartpole.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import tensorflow as tf
-
-import ray
-from ray.rllib.dqn import DQN, DEFAULT_CONFIG
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Run the A3C algorithm.")
-    parser.add_argument("--iterations", default=-1, type=int,
-                        help="The number of training iterations to run.")
-
-    args = parser.parse_args()
-
-    config = DEFAULT_CONFIG.copy()
-    config.update(dict(
-        lr=1e-3,
-        schedule_max_timesteps=100000,
-        exploration_fraction=0.1,
-        exploration_final_eps=0.02,
-        dueling=False,
-        hiddens=[],
-        model_config=dict(
-            fcnet_hiddens=[64],
-            fcnet_activation=tf.nn.relu
-        )))
-
-    # Currently Ray is not used in this example, but we need to call ray.init
-    # to create the directory in which logging will occur. TODO(rkn): Fix this.
-    ray.init()
-
-    dqn = DQN("CartPole-v0", config)
-
-    iteration = 0
-    while iteration != args.iterations:
-        iteration += 1
-        res = dqn.train()
-        print("current status: {}".format(res))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/ray/rllib/dqn/example.py b/python/ray/rllib/dqn/example.py
deleted file mode 100755
index 8d40eadfb..000000000
--- a/python/ray/rllib/dqn/example.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.dqn import DQN, DEFAULT_CONFIG
-
-
-def main():
-    config = DEFAULT_CONFIG.copy()
-    config.update(dict(
-        lr=1e-4,
-        schedule_max_timesteps=2000000,
-        buffer_size=10000,
-        exploration_fraction=0.1,
-        exploration_final_eps=0.01,
-        train_freq=4,
-        learning_starts=10000,
-        target_network_update_freq=1000,
-        gamma=0.99,
-        prioritized_replay=True))
-
-    dqn = DQN("PongNoFrameskip-v4", config)
-
-    while True:
-        res = dqn.train()
-        print("current status: {}".format(res))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/ray/rllib/dqn/replay_buffer.py b/python/ray/rllib/dqn/replay_buffer.py
index ff19f434e..1d4c49b2f 100644
--- a/python/ray/rllib/dqn/replay_buffer.py
+++ b/python/ray/rllib/dqn/replay_buffer.py
@@ -106,7 +106,7 @@ class PrioritizedReplayBuffer(ReplayBuffer):
     def add(self, *args, **kwargs):
         """See ReplayBuffer.store_effect"""
         idx = self._next_idx
-        super().add(*args, **kwargs)
+        super(PrioritizedReplayBuffer, self).add(*args, **kwargs)
         self._it_sum[idx] = self._max_priority ** self._alpha
         self._it_min[idx] = self._max_priority ** self._alpha
 
diff --git a/python/ray/rllib/evolution_strategies/example.py b/python/ray/rllib/evolution_strategies/example.py
deleted file mode 100755
index 99fc24966..000000000
--- a/python/ray/rllib/evolution_strategies/example.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-
-import ray
-from ray.rllib.evolution_strategies import EvolutionStrategies, DEFAULT_CONFIG
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Train an RL agent on Pong.")
-    parser.add_argument("--num-workers", default=10, type=int,
-                        help=("The number of actors to create in aggregate "
-                              "across the cluster."))
-    parser.add_argument("--env-name", default="Pendulum-v0", type=str,
-                        help="The name of the gym environment to use.")
-    parser.add_argument("--stepsize", default=0.01, type=float,
-                        help="The stepsize to use.")
-    parser.add_argument("--redis-address", default=None, type=str,
-                        help="The Redis address of the cluster.")
-    parser.add_argument("--iterations", default=-1, type=int,
-                        help="The number of training iterations to run.")
-
-    args = parser.parse_args()
-    num_workers = args.num_workers
-    env_name = args.env_name
-    stepsize = args.stepsize
-
-    ray.init(redis_address=args.redis_address,
-             num_workers=(0 if args.redis_address is None else None))
-
-    config = DEFAULT_CONFIG.copy()
-    config["num_workers"] = num_workers
-    config["stepsize"] = stepsize
-
-    alg = EvolutionStrategies(env_name, config)
-    iteration = 0
-    while iteration != args.iterations:
-        iteration += 1
-        result = alg.train()
-        print("current status: {}".format(result))
diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py
index 13b2155f9..3faac9d38 100644
--- a/python/ray/rllib/models/fcnet.py
+++ b/python/ray/rllib/models/fcnet.py
@@ -33,7 +33,18 @@ class FullyConnectedNetwork(Model):
 
     def _init(self, inputs, num_outputs, options):
         hiddens = options.get("fcnet_hiddens", [256, 256])
-        activation = options.get("fcnet_activation", tf.nn.tanh)
+        # "fcnet_activation" is a string name so that it can be given in a
+        # JSON config; a callable is still accepted for existing callers.
+        fcnet_activation = options.get("fcnet_activation", "tanh")
+        if fcnet_activation == "tanh":
+            activation = tf.nn.tanh
+        elif fcnet_activation == "relu":
+            activation = tf.nn.relu
+        elif callable(fcnet_activation):
+            activation = fcnet_activation
+        else:
+            raise ValueError(
+                "Unknown fcnet_activation: {}".format(fcnet_activation))
         print("Constructing fcnet {} {}".format(hiddens, activation))
 
         if options.get("free_logstd", False):
diff --git a/python/ray/rllib/policy_gradient/example.py b/python/ray/rllib/policy_gradient/example.py
deleted file mode 100755
index 2ddf70188..000000000
--- a/python/ray/rllib/policy_gradient/example.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-
-import ray
-from ray.rllib.policy_gradient import PolicyGradient, DEFAULT_CONFIG
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run the policy gradient "
-                                                 "algorithm.")
-    parser.add_argument("--environment", default="Pong-v0", type=str,
-                        help="The gym environment to use.")
-    parser.add_argument("--redis-address", default=None, type=str,
-                        help="The Redis address of the cluster.")
-    parser.add_argument("--use-tf-debugger", default=False, type=bool,
-                        help="Run the script inside of tf-dbg.")
-    parser.add_argument("--load-checkpoint", default=None, type=str,
-                        help="Continue training from a checkpoint.")
-    parser.add_argument("--iterations", default=None, type=int,
-                        help="The number of training iterations to run.")
-
-    args = parser.parse_args()
-    config = DEFAULT_CONFIG.copy()
-    config["use_tf_debugger"] = args.use_tf_debugger
-    if args.load_checkpoint is not None:
-        config["load_checkpoint"] = args.load_checkpoint
-    if args.iterations is not None:
-        config["max_iterations"] = args.iterations
-
-    ray.init(redis_address=args.redis_address)
-
-    alg = PolicyGradient(args.environment, config)
-    result = alg.train()
-    while result.training_iteration < config["max_iterations"]:
-        print("\n== iteration", result.training_iteration)
-        result = alg.train()
-        print("current status: {}".format(result))
diff --git a/python/ray/rllib/policy_gradient/policy_gradient.py b/python/ray/rllib/policy_gradient/policy_gradient.py
index 4f62d38e2..204d4282a 100644
--- a/python/ray/rllib/policy_gradient/policy_gradient.py
+++ b/python/ray/rllib/policy_gradient/policy_gradient.py
@@ -28,8 +28,6 @@ DEFAULT_CONFIG = {
     "kl_coeff": 0.2,
     # Number of SGD iterations in each outer loop
     "num_sgd_iter": 30,
-    # Number of outer loop iterations
-    "max_iterations": 1000,
     # Stepsize of SGD
     "sgd_stepsize": 5e-5,
     # TODO(pcm): Expose the choice between gpus and cpus
diff --git a/python/ray/rllib/train.py b/python/ray/rllib/train.py
index e2de3ff4f..c9123843c 100755
--- a/python/ray/rllib/train.py
+++ b/python/ray/rllib/train.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 import argparse
 import json
 import os
+import sys
 
 import ray
 import ray.rllib.policy_gradient as pg
@@ -16,36 +17,45 @@ import ray.rllib.a3c as a3c
 
 parser = argparse.ArgumentParser(
     description=("Train a reinforcement learning agent."))
-parser.add_argument("--env", required=True, type=str)
-parser.add_argument("--alg", required=True, type=str)
-parser.add_argument("--config", default="{}", type=str)
-parser.add_argument("--upload-dir", default="file:///tmp/ray", type=str)
+parser.add_argument("--redis-address", default=None, type=str,
+                    help="The Redis address of the cluster.")
+parser.add_argument("--env", required=True, type=str,
+                    help="The gym environment to use.")
+parser.add_argument("--alg", required=True, type=str,
+                    help="The reinforcement learning algorithm to use.")
+parser.add_argument("--num-iterations", default=sys.maxsize, type=int,
+                    help="The number of training iterations to run.")
+parser.add_argument("--config", default="{}", type=str,
+                    help="The configuration options of the algorithm.")
+parser.add_argument("--upload-dir", default="file:///tmp/ray", type=str,
+                    help="Where the traces are stored.")
 
 
 if __name__ == "__main__":
     args = parser.parse_args()
+    json_config = json.loads(args.config)
 
-    ray.init()
+    ray.init(redis_address=args.redis_address)
 
     env_name = args.env
     if args.alg == "PolicyGradient":
         config = pg.DEFAULT_CONFIG.copy()
-        config.update(json.loads(args.config))
+        config.update(json_config)
         alg = pg.PolicyGradient(
             env_name, config, upload_dir=args.upload_dir)
     elif args.alg == "EvolutionStrategies":
         config = es.DEFAULT_CONFIG.copy()
-        config.update(json.loads(args.config))
+        config.update(json_config)
         alg = es.EvolutionStrategies(
             env_name, config, upload_dir=args.upload_dir)
     elif args.alg == "DQN":
         config = dqn.DEFAULT_CONFIG.copy()
-        config.update(json.loads(args.config))
+        config.update(json_config)
         alg = dqn.DQN(
             env_name, config, upload_dir=args.upload_dir)
     elif args.alg == "A3C":
         config = a3c.DEFAULT_CONFIG.copy()
-        config.update(json.loads(args.config))
+        config.update(json_config)
         alg = a3c.A3C(
             env_name, config, upload_dir=args.upload_dir)
     else:
@@ -56,7 +66,11 @@ if __name__ == "__main__":
     result_logger = ray.rllib.common.RLLibLogger(
         os.path.join(alg.logdir, "result.json"))
 
-    while True:
+    # Count by hand instead of using range(): on Python 2,
+    # range(sys.maxsize) tries to build the whole list and fails.
+    iteration = 0
+    while iteration < args.num_iterations:
+        iteration += 1
         result = alg.train()
 
         # We need to use a custom json serializer class so that NaNs get
@@ -64,3 +78,5 @@ if __name__ == "__main__":
         json.dump(result._asdict(), result_logger,
                   cls=ray.rllib.common.RLLibEncoder)
         result_logger.write("\n")
+
+        print("current status: {}".format(result))
diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh
index 094474662..ebf5016c1 100755
--- a/test/jenkins_tests/run_multi_node_tests.sh
+++ b/test/jenkins_tests/run_multi_node_tests.sh
@@ -57,19 +57,36 @@ python $ROOT_DIR/multi_node_docker_test.py \
 #     python /ray/examples/hyperopt/hyperopt_adaptive.py
 
 docker run --shm-size=10G --memory=10G $DOCKER_SHA \
-    python /ray/python/ray/rllib/a3c/example.py \
-    --environment=PongDeterministic-v0 \
-    --iterations=2
-
-# docker run --shm-size=10G --memory=10G $DOCKER_SHA \
-#     python /ray/python/ray/rllib/policy_gradient/example.py \
-#     --iterations=2
+    python /ray/python/ray/rllib/train.py \
+    --env PongDeterministic-v0 \
+    --alg A3C \
+    --num-iterations 2 \
+    --config '{"num_workers": 16}'
 
 docker run --shm-size=10G --memory=10G $DOCKER_SHA \
-    python /ray/python/ray/rllib/evolution_strategies/example.py \
-    --env-name=Pendulum-v0 \
-    --iterations=2
+    python /ray/python/ray/rllib/train.py \
+    --env CartPole-v1 \
+    --alg PolicyGradient \
+    --num-iterations 2 \
+    --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_agents": 1}'
 
 docker run --shm-size=10G --memory=10G $DOCKER_SHA \
-    python /ray/python/ray/rllib/dqn/example-cartpole.py \
-    --iterations=2
+    python /ray/python/ray/rllib/train.py \
+    --env Pendulum-v0 \
+    --alg EvolutionStrategies \
+    --num-iterations 2 \
+    --config '{"stepsize": 0.01}'
+
+docker run --shm-size=10G --memory=10G $DOCKER_SHA \
+    python /ray/python/ray/rllib/train.py \
+    --env CartPole-v0 \
+    --alg DQN \
+    --num-iterations 2 \
+    --config '{"lr": 1e-3, "schedule_max_timesteps": 100000, "exploration_fraction": 0.1, "exploration_final_eps": 0.02, "dueling": false, "hiddens": [], "model_config": {"fcnet_hiddens": [64], "fcnet_activation": "relu"}}'
+
+docker run --shm-size=10G --memory=10G $DOCKER_SHA \
+    python /ray/python/ray/rllib/train.py \
+    --env PongNoFrameskip-v4 \
+    --alg DQN \
+    --num-iterations 2 \
+    --config '{"lr": 1e-4, "schedule_max_timesteps": 2000000, "buffer_size": 10000, "exploration_fraction": 0.1, "exploration_final_eps": 0.01, "train_freq": 4, "learning_starts": 10000, "target_network_update_freq": 1000, "gamma": 0.99, "prioritized_replay": true}'