From df65e87fc71c21bbce837063144250073c1cfa37 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 3 Aug 2017 16:34:06 -0700 Subject: [PATCH] [rllib] Tune ppo more on control tasks (#777) * tune ppo on control tasks * introduce free log_std * fix * flag for writing logs * fixes * fixes --- python/ray/rllib/models/catalog.py | 1 + python/ray/rllib/models/fcnet.py | 19 ++++++++++++++++++- python/ray/rllib/policy_gradient/loss.py | 2 +- .../rllib/policy_gradient/policy_gradient.py | 13 ++++++++++--- python/ray/rllib/test.sh | 10 ++++++++-- 5 files changed, 38 insertions(+), 7 deletions(-) diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py index 1c1de23cf..a5548643f 100644 --- a/python/ray/rllib/models/catalog.py +++ b/python/ray/rllib/models/catalog.py @@ -26,6 +26,7 @@ class ModelCatalog(object): Args: action_space (Space): Action space of the target gym env. + dist_type (Optional[str]): Identifier of the action distribution. Returns: dist_class (ActionDistribution): Python class of the distribution. diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py index 34d599aea..13b2155f9 100644 --- a/python/ray/rllib/models/fcnet.py +++ b/python/ray/rllib/models/fcnet.py @@ -19,13 +19,26 @@ def normc_initializer(std=1.0): class FullyConnectedNetwork(Model): - """Generic fully connected network.""" + """Generic fully connected network. + + Options to construct the network are passed to the _init function. + If options["free_logstd"] is True, the last half of the + output layer will be free variables that are not dependent on + inputs. This is often used if the output of the network is used + to parametrize a probability distribution. In this case, the + first half of the parameters can be interpreted as a location + parameter (like a mean) and the second half can be interpreted as + a scale parameter (like a standard deviation). + """ def _init(self, inputs, num_outputs, options): hiddens = options.get("fcnet_hiddens", [256, 256]) activation = options.get("fcnet_activation", tf.nn.tanh) print("Constructing fcnet {} {}".format(hiddens, activation)) + if options.get("free_logstd", False): + num_outputs = num_outputs // 2 + with tf.name_scope("fc_net"): i = 1 last_layer = inputs @@ -40,4 +53,8 @@ class FullyConnectedNetwork(Model): last_layer, num_outputs, weights_initializer=normc_initializer(0.01), activation_fn=None, scope="fc_out") + if options.get("free_logstd", False): + logstd = tf.get_variable(name="logstd", shape=[num_outputs], + initializer=tf.zeros_initializer) + output = tf.concat([output, 0.0 * output + logstd], 1) return output, last_layer diff --git a/python/ray/rllib/policy_gradient/loss.py b/python/ray/rllib/policy_gradient/loss.py index ef40f577a..dfb82d071 100644 --- a/python/ray/rllib/policy_gradient/loss.py +++ b/python/ray/rllib/policy_gradient/loss.py @@ -22,7 +22,7 @@ class ProximalPolicyLoss(object): self.observations = observations self.curr_logits = ModelCatalog.get_model( - observations, logit_dim).outputs + observations, logit_dim, config["model"]).outputs self.curr_dist = distribution_class(self.curr_logits) self.sampler = self.curr_dist.sample() diff --git a/python/ray/rllib/policy_gradient/policy_gradient.py b/python/ray/rllib/policy_gradient/policy_gradient.py index df2e2b0ee..47af6eacf 100644 --- a/python/ray/rllib/policy_gradient/policy_gradient.py +++ b/python/ray/rllib/policy_gradient/policy_gradient.py @@ -18,6 +18,7 @@ from ray.rllib.policy_gradient.utils import shuffle DEFAULT_CONFIG = { + "gamma": 0.995, "kl_coeff": 0.2, "num_sgd_iter": 30, "max_iterations": 1000, @@ -34,11 +35,13 @@ DEFAULT_CONFIG = { "entropy_coeff": 0.0, "clip_param": 0.3, "kl_target": 0.01, + "model": {"free_logstd": False}, "timesteps_per_batch": 40000, "num_agents": 5, "full_trace_nth_sgd_batch": -1, "full_trace_data_load": False, "use_tf_debugger": False, + "write_logs": True, # write checkpoints and tensorflow logging? "model_checkpoint_file": "iteration-%s.ckpt"} @@ -53,7 +56,9 @@ class PolicyGradient(Algorithm): preprocessor = AtariPixelPreprocessor() elif self.env_name == "Pong-ram-v3": preprocessor = AtariRamPreprocessor() - elif self.env_name == "CartPole-v0": + elif self.env_name == "CartPole-v0" or self.env_name == "CartPole-v1": + preprocessor = NoPreprocessor() + elif self.env_name == "Hopper-v1": preprocessor = NoPreprocessor() elif self.env_name == "Walker2d-v1": preprocessor = NoPreprocessor() @@ -82,12 +87,14 @@ class PolicyGradient(Algorithm): j = self.j self.j += 1 + print("===> iteration", self.j) + saver = tf.train.Saver(max_to_keep=None) if "load_checkpoint" in config: saver.restore(model.sess, config["load_checkpoint"]) # TF does not support to write logs to S3 at the moment - write_tf_logs = self.logdir.startswith("file") + write_tf_logs = config["write_logs"] and self.logdir.startswith("file") iter_start = time.time() if write_tf_logs: file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph) @@ -101,7 +108,7 @@ class PolicyGradient(Algorithm): weights = ray.put(model.get_weights()) [a.load_weights.remote(weights) for a in agents] trajectory, total_reward, traj_len_mean = collect_samples( - agents, config["timesteps_per_batch"], 0.995, 1.0, 2000) + agents, config["timesteps_per_batch"], config["gamma"], 1.0, 2000) print("total reward is ", total_reward) print("trajectory length mean is ", traj_len_mean) print("timesteps:", trajectory["dones"].shape[0]) diff --git a/python/ray/rllib/test.sh b/python/ray/rllib/test.sh index 2e11229cd..f721dc4c4 100755 --- a/python/ray/rllib/test.sh +++ b/python/ray/rllib/test.sh @@ -1,7 +1,13 @@ #!/bin/bash -python train.py --env Walker2d-v1 --alg PolicyGradient --upload-dir s3://bucketname/ -python train.py --env Humanoid-v1 --alg PolicyGradient --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --upload-dir s3://bucketname/ +python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/ + +python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/ + +python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/ + +python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_logstd": true}}' --alg PolicyGradient --upload-dir s3://bucketname/ + python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/ python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/ python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/