mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:38:18 +08:00
[rllib] Tune ppo more on control tasks (#777)
* tune ppo on control tasks * introduce free log_std * fix * flag for writing logs * fixes * fixes
This commit is contained in:
committed by
Robert Nishihara
parent
99badc7ae4
commit
df65e87fc7
@@ -26,6 +26,7 @@ class ModelCatalog(object):
|
||||
|
||||
Args:
|
||||
action_space (Space): Action space of the target gym env.
|
||||
dist_type (Optional[str]): Identifier of the action distribution.
|
||||
|
||||
Returns:
|
||||
dist_class (ActionDistribution): Python class of the distribution.
|
||||
|
||||
@@ -19,13 +19,26 @@ def normc_initializer(std=1.0):
|
||||
|
||||
|
||||
class FullyConnectedNetwork(Model):
|
||||
"""Generic fully connected network."""
|
||||
"""Generic fully connected network.
|
||||
|
||||
Options to construct the network are passed to the _init function.
|
||||
If options["free_logstd"] is True, the last half of the
|
||||
output layer will be free variables that are not dependent on
|
||||
inputs. This is often used if the output of the network is used
|
||||
to parametrize a probability distribution. In this case, the
|
||||
first half of the parameters can be interpreted as a location
|
||||
parameter (like a mean) and the second half can be interpreted as
|
||||
a scale parameter (like a standard deviation).
|
||||
"""
|
||||
|
||||
def _init(self, inputs, num_outputs, options):
|
||||
hiddens = options.get("fcnet_hiddens", [256, 256])
|
||||
activation = options.get("fcnet_activation", tf.nn.tanh)
|
||||
print("Constructing fcnet {} {}".format(hiddens, activation))
|
||||
|
||||
if options.get("free_logstd", False):
|
||||
num_outputs = num_outputs // 2
|
||||
|
||||
with tf.name_scope("fc_net"):
|
||||
i = 1
|
||||
last_layer = inputs
|
||||
@@ -40,4 +53,8 @@ class FullyConnectedNetwork(Model):
|
||||
last_layer, num_outputs,
|
||||
weights_initializer=normc_initializer(0.01),
|
||||
activation_fn=None, scope="fc_out")
|
||||
if options.get("free_logstd", False):
|
||||
logstd = tf.get_variable(name="logstd", shape=[num_outputs],
|
||||
initializer=tf.zeros_initializer)
|
||||
output = tf.concat([output, 0.0 * output + logstd], 1)
|
||||
return output, last_layer
|
||||
|
||||
@@ -22,7 +22,7 @@ class ProximalPolicyLoss(object):
|
||||
self.observations = observations
|
||||
|
||||
self.curr_logits = ModelCatalog.get_model(
|
||||
observations, logit_dim).outputs
|
||||
observations, logit_dim, config["model"]).outputs
|
||||
self.curr_dist = distribution_class(self.curr_logits)
|
||||
self.sampler = self.curr_dist.sample()
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ from ray.rllib.policy_gradient.utils import shuffle
|
||||
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
"gamma": 0.995,
|
||||
"kl_coeff": 0.2,
|
||||
"num_sgd_iter": 30,
|
||||
"max_iterations": 1000,
|
||||
@@ -34,11 +35,13 @@ DEFAULT_CONFIG = {
|
||||
"entropy_coeff": 0.0,
|
||||
"clip_param": 0.3,
|
||||
"kl_target": 0.01,
|
||||
"model": {"free_logstd": False},
|
||||
"timesteps_per_batch": 40000,
|
||||
"num_agents": 5,
|
||||
"full_trace_nth_sgd_batch": -1,
|
||||
"full_trace_data_load": False,
|
||||
"use_tf_debugger": False,
|
||||
"write_logs": True, # write checkpoints and tensorflow logging?
|
||||
"model_checkpoint_file": "iteration-%s.ckpt"}
|
||||
|
||||
|
||||
@@ -53,7 +56,9 @@ class PolicyGradient(Algorithm):
|
||||
preprocessor = AtariPixelPreprocessor()
|
||||
elif self.env_name == "Pong-ram-v3":
|
||||
preprocessor = AtariRamPreprocessor()
|
||||
elif self.env_name == "CartPole-v0":
|
||||
elif self.env_name == "CartPole-v0" or self.env_name == "CartPole-v1":
|
||||
preprocessor = NoPreprocessor()
|
||||
elif self.env_name == "Hopper-v1":
|
||||
preprocessor = NoPreprocessor()
|
||||
elif self.env_name == "Walker2d-v1":
|
||||
preprocessor = NoPreprocessor()
|
||||
@@ -82,12 +87,14 @@ class PolicyGradient(Algorithm):
|
||||
j = self.j
|
||||
self.j += 1
|
||||
|
||||
print("===> iteration", self.j)
|
||||
|
||||
saver = tf.train.Saver(max_to_keep=None)
|
||||
if "load_checkpoint" in config:
|
||||
saver.restore(model.sess, config["load_checkpoint"])
|
||||
|
||||
# TF does not support writing logs to S3 at the moment
|
||||
write_tf_logs = self.logdir.startswith("file")
|
||||
write_tf_logs = config["write_logs"] and self.logdir.startswith("file")
|
||||
iter_start = time.time()
|
||||
if write_tf_logs:
|
||||
file_writer = tf.summary.FileWriter(self.logdir, model.sess.graph)
|
||||
@@ -101,7 +108,7 @@ class PolicyGradient(Algorithm):
|
||||
weights = ray.put(model.get_weights())
|
||||
[a.load_weights.remote(weights) for a in agents]
|
||||
trajectory, total_reward, traj_len_mean = collect_samples(
|
||||
agents, config["timesteps_per_batch"], 0.995, 1.0, 2000)
|
||||
agents, config["timesteps_per_batch"], config["gamma"], 1.0, 2000)
|
||||
print("total reward is ", total_reward)
|
||||
print("trajectory length mean is ", traj_len_mean)
|
||||
print("timesteps:", trajectory["dones"].shape[0])
|
||||
|
||||
@@ -1,7 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
python train.py --env Walker2d-v1 --alg PolicyGradient --upload-dir s3://bucketname/
|
||||
python train.py --env Humanoid-v1 --alg PolicyGradient --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --upload-dir s3://bucketname/
|
||||
python train.py --env Hopper-v1 --config '{"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
|
||||
|
||||
python train.py --env CartPole-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
|
||||
|
||||
python train.py --env Walker2d-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64}' --alg PolicyGradient --upload-dir s3://bucketname/
|
||||
|
||||
python train.py --env Humanoid-v1 --config '{"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_agents": 64, "model": {"free_logstd": true}}' --alg PolicyGradient --upload-dir s3://bucketname/
|
||||
|
||||
python train.py --env PongNoFrameskip-v0 --alg DQN --upload-dir s3://bucketname/
|
||||
python train.py --env PongDeterministic-v0 --alg A3C --upload-dir s3://bucketname/
|
||||
python train.py --env Humanoid-v1 --alg EvolutionStrategies --upload-dir s3://bucketname/
|
||||
|
||||
Reference in New Issue
Block a user