Add MuJoCo
@@ -1,5 +1,7 @@
|
||||
# pytorch-a2c-ppo-acktr
|
||||
|
||||
## Update 09/27/2017: now supports both Atari and MuJoCo/Roboschool!
|
||||
|
||||
This is a PyTorch implementation of
|
||||
* Advantage Actor Critic (A2C), a synchronous deterministic version of [A3C](https://arxiv.org/pdf/1602.01783v1.pdf)
|
||||
* Proximal Policy Optimization [PPO](https://arxiv.org/pdf/1707.06347.pdf)
|
||||
@@ -13,47 +15,85 @@ This implementation is inspired by the OpenAI baselines for [A2C](https://github
|
||||
|
||||
Contributions are very welcome. If you know how to make this code better, don't hesitate to send a pull request. Also see a todo list below.
|
||||
|
||||
Also I'm searching for volunteers to run all experiments on Atari and MuJoCo (with multiple random seeds).
|
||||
|
||||
## Disclaimer
|
||||
|
||||
It's extremely difficult to reproduce results for Reinforcement Learning methods. See ["Deep Reinforcement Learning that Matters"](https://arxiv.org/abs/1709.06560) for more information. I tried to reproduce OpenAI results as closely as possible. However, majors differences in performance can be caused even by minor differences in TensorFlow and PyTorch libraries.
|
||||
|
||||
### TODO
|
||||
* Add MuJoCo and continuous actions
|
||||
* Improve this README file. Rearrange images.
|
||||
* Improve performance of KFAC, see kfac.py for more information
|
||||
* Run evaluation for all games and algorithms
|
||||
|
||||
## Usage
|
||||
|
||||
### A2C
|
||||
### Atari
|
||||
#### A2C
|
||||
|
||||
```
|
||||
python main.py --env-name "PongNoFrameskip-v4"
|
||||
```
|
||||
|
||||
### PPO
|
||||
#### PPO
|
||||
|
||||
```
|
||||
python main.py --env-name "PongNoFrameskip-v4" --algo ppo --use-gae --num-processes 8 --num-steps 256 --vis-interval 1 --log-interval 1
|
||||
```
|
||||
|
||||
### ACKTR
|
||||
#### ACKTR
|
||||
|
||||
```
|
||||
python main.py --env-name "PongNoFrameskip-v4" --algo acktr --num-processes 32 --num-steps 20
|
||||
```
|
||||
|
||||
### MuJoCo
|
||||
#### A2C
|
||||
|
||||
```
|
||||
python main.py --env-name "Reacher-v1" --num-stack 1 --num-frames 1000000
|
||||
```
|
||||
|
||||
#### PPO
|
||||
|
||||
```
|
||||
python main.py --env-name "Reacher-v1" --algo ppo --use-gae --vis-interval 1 --log-interval 1 --num-stack 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --ppo-epoch 10 --batch-size 64 --gamma 0.99 --tau 0.95 --num-frames 1000000
|
||||
```
|
||||
|
||||
#### ACKTR
|
||||
|
||||
ACKTR requires some modifications to be made specifically for MuJoCo. But at the moment, I want to keep this code as unified as possible. Thus, I'm going for better ways to integrate it into the codebase.
|
||||
|
||||
## Results
|
||||
|
||||
### A2C
|
||||
|
||||

|
||||

|
||||
|
||||

|
||||

|
||||
|
||||

|
||||

|
||||
|
||||

|
||||

|
||||
|
||||
### PPO
|
||||
|
||||
Coming soon.
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
|
||||
### ACKTR
|
||||
|
||||
Coming soon.
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
@@ -4,7 +4,7 @@ import gym
|
||||
from gym.spaces.box import Box
|
||||
|
||||
from baselines import bench
|
||||
from baselines.common.atari_wrappers import *
|
||||
from baselines.common.atari_wrappers import wrap_deepmind
|
||||
|
||||
|
||||
def make_env(env_id, seed, rank, log_dir):
|
||||
@@ -14,8 +14,10 @@ def make_env(env_id, seed, rank, log_dir):
|
||||
env = bench.Monitor(env,
|
||||
os.path.join(log_dir,
|
||||
"{}.monitor.json".format(rank)))
|
||||
env = wrap_deepmind(env)
|
||||
env = WrapPyTorch(env)
|
||||
# Ugly hack to detect atari.
|
||||
if env.action_space.__class__.__name__ == 'Discrete':
|
||||
env = wrap_deepmind(env)
|
||||
env = WrapPyTorch(env)
|
||||
return env
|
||||
|
||||
return _thunk
|
||||
|
||||
|
Before Width: | Height: | Size: 22 KiB After Width: | Height: | Size: 22 KiB |
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 24 KiB |
|
Before Width: | Height: | Size: 25 KiB After Width: | Height: | Size: 25 KiB |
|
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 20 KiB |
|
After Width: | Height: | Size: 21 KiB |
|
After Width: | Height: | Size: 19 KiB |
|
After Width: | Height: | Size: 20 KiB |
|
After Width: | Height: | Size: 20 KiB |
|
After Width: | Height: | Size: 18 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 17 KiB |
|
After Width: | Height: | Size: 20 KiB |
@@ -11,8 +11,6 @@ import torch.optim as optim
|
||||
|
||||
|
||||
def _extract_patches(x, kernel_size, stride, padding):
|
||||
#result = P.im2col(Variable(x), kernel_size, stride, padding).data
|
||||
#return result.view(result.size(0), -1, result.size(-2), result.size(-1))
|
||||
if padding[0] + padding[1] > 0:
|
||||
x = F.pad(x, (padding[1], padding[1], padding[0],
|
||||
padding[0])).data # Actually check dims
|
||||
@@ -164,7 +162,6 @@ class KFACOptimizer(optim.Optimizer):
|
||||
raise NotImplementedError(
|
||||
'Layer {} is not supported'.format(classname))
|
||||
|
||||
#@profile
|
||||
def step(self):
|
||||
# Add weight decay
|
||||
if self.weight_decay > 0:
|
||||
@@ -187,10 +184,12 @@ class KFACOptimizer(optim.Optimizer):
|
||||
self.m_aa[m].cpu().double(), eigenvectors=True)
|
||||
self.d_g[m], self.Q_g[m] = torch.symeig(
|
||||
self.m_gg[m].cpu().double(), eigenvectors=True)
|
||||
self.d_a[m], self.Q_a[m] = self.d_a[
|
||||
m].float().cuda(), self.Q_a[m].float().cuda()
|
||||
self.d_g[m], self.Q_g[m] = self.d_g[
|
||||
m].float().cuda(), self.Q_g[m].float().cuda()
|
||||
self.d_a[m], self.Q_a[m] = self.d_a[m].float(), self.Q_a[m].float()
|
||||
self.d_g[m], self.Q_g[m] = self.d_g[m].float(), self.Q_g[m].float()
|
||||
if self.m_aa[m].is_cuda:
|
||||
self.d_a[m], self.Q_a[m] = self.d_a[m].cuda(), self.Q_a[m].cuda()
|
||||
self.d_g[m], self.Q_g[m] = self.d_g[m].cuda(), self.Q_g[m].cuda()
|
||||
|
||||
self.d_a[m].mul_((self.d_a[m] > 1e-6).float())
|
||||
self.d_g[m].mul_((self.d_g[m] > 1e-6).float())
|
||||
|
||||
|
||||
@@ -15,9 +15,9 @@ from arguments import get_args
|
||||
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
|
||||
from envs import make_env
|
||||
from kfac import KFACOptimizer
|
||||
from model import ActorCritic
|
||||
from model import CNNPolicy, MLPPolicy
|
||||
from storage import RolloutStorage
|
||||
from vizualize_atari import visdom_plot
|
||||
from visualize import visdom_plot
|
||||
|
||||
args = get_args()
|
||||
|
||||
@@ -59,7 +59,12 @@ def main():
|
||||
obs_shape = envs.observation_space.shape
|
||||
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
|
||||
|
||||
actor_critic = ActorCritic(obs_shape[0], envs.action_space)
|
||||
if envs.action_space.__class__.__name__ == 'Discrete':
|
||||
actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
|
||||
action_shape = 1
|
||||
else:
|
||||
actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
|
||||
action_shape = envs.action_space.shape[0]
|
||||
|
||||
if args.cuda:
|
||||
actor_critic.cuda()
|
||||
@@ -71,13 +76,15 @@ def main():
|
||||
elif args.algo == 'acktr':
|
||||
optimizer = KFACOptimizer(actor_critic)
|
||||
|
||||
rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space.n)
|
||||
|
||||
rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
|
||||
current_state = torch.zeros(args.num_processes, *obs_shape)
|
||||
|
||||
def update_current_state(state):
|
||||
state = torch.from_numpy(np.stack(state)).float()
|
||||
current_state[:, :-1] = current_state[:, 1:]
|
||||
current_state[:, -1] = state
|
||||
shape_dim0 = envs.observation_space.shape[0]
|
||||
state = torch.from_numpy(state).float()
|
||||
if args.num_stack > 1:
|
||||
current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
|
||||
current_state[:, -shape_dim0:] = state
|
||||
|
||||
state = envs.reset()
|
||||
update_current_state(state)
|
||||
@@ -103,7 +110,6 @@ def main():
|
||||
|
||||
# Obser reward and next state
|
||||
state, reward, done, info = envs.step(cpu_actions)
|
||||
|
||||
reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
|
||||
episode_rewards += reward
|
||||
|
||||
@@ -115,17 +121,24 @@ def main():
|
||||
|
||||
if args.cuda:
|
||||
masks = masks.cuda()
|
||||
current_state *= masks.unsqueeze(2).unsqueeze(2)
|
||||
|
||||
if current_state.dim() == 4:
|
||||
current_state *= masks.unsqueeze(2).unsqueeze(2)
|
||||
else:
|
||||
current_state *= masks
|
||||
|
||||
update_current_state(state)
|
||||
rollouts.insert(step, current_state, action.data, value.data, reward, masks)
|
||||
|
||||
next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
|
||||
|
||||
if hasattr(actor_critic, 'obs_filter'):
|
||||
actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
|
||||
|
||||
rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
|
||||
|
||||
if args.algo in ['a2c', 'acktr']:
|
||||
values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, 1)))
|
||||
values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape)))
|
||||
|
||||
values = values.view(args.num_steps, args.num_processes, 1)
|
||||
action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)
|
||||
@@ -164,6 +177,8 @@ def main():
|
||||
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
|
||||
|
||||
old_model.load_state_dict(actor_critic.state_dict())
|
||||
if hasattr(actor_critic, 'obs_filter'):
|
||||
old_model.obs_filter = actor_critic.obs_filter
|
||||
|
||||
for _ in range(args.ppo_epoch):
|
||||
sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
|
||||
@@ -171,8 +186,8 @@ def main():
|
||||
indices = torch.LongTensor(indices)
|
||||
if args.cuda:
|
||||
indices = indices.cuda()
|
||||
states_batch = rollouts.states[:-1].view(-1, *rollouts.states.size()[-3:])[indices]
|
||||
actions_batch = rollouts.actions.view(-1, 1)[indices]
|
||||
states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
|
||||
actions_batch = rollouts.actions.view(-1, action_shape)[indices]
|
||||
return_batch = rollouts.returns[:-1].view(-1, 1)[indices]
|
||||
|
||||
# Reshape to do in a single forward pass for all steps
|
||||
@@ -183,7 +198,7 @@ def main():
|
||||
ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
|
||||
adv_targ = Variable(advantages.view(-1, 1)[indices])
|
||||
surr1 = ratio * adv_targ
|
||||
surr2 = ratio.clamp(1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
|
||||
surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
|
||||
action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)
|
||||
|
||||
value_loss = (Variable(return_batch) - values).pow(2).mean()
|
||||
|
||||
@@ -3,6 +3,8 @@ import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.autograd import Variable
|
||||
from running_stat import ObsNorm
|
||||
|
||||
|
||||
def weights_init(m):
|
||||
@@ -28,9 +30,9 @@ class AddBias(nn.Module):
|
||||
return x + bias
|
||||
|
||||
|
||||
class ActorCritic(torch.nn.Module):
|
||||
class CNNPolicy(torch.nn.Module):
|
||||
def __init__(self, num_inputs, action_space):
|
||||
super(ActorCritic, self).__init__()
|
||||
super(CNNPolicy, self).__init__()
|
||||
self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4, bias=False)
|
||||
self.ab1 = AddBias(32)
|
||||
self.conv2 = nn.Conv2d(32, 64, 4, stride=2, bias=False)
|
||||
@@ -41,19 +43,20 @@ class ActorCritic(torch.nn.Module):
|
||||
self.linear1 = nn.Linear(32 * 7 * 7, 512, bias=False)
|
||||
self.ab_fc1 = AddBias(512)
|
||||
|
||||
num_outputs = action_space.n
|
||||
self.critic_linear = nn.Linear(512, 1, bias=False)
|
||||
self.ab_fc2 = AddBias(1)
|
||||
|
||||
num_outputs = action_space.n
|
||||
self.actor_linear = nn.Linear(512, num_outputs, bias=False)
|
||||
self.ab_fc3 = AddBias(num_outputs)
|
||||
|
||||
self.apply(weights_init)
|
||||
|
||||
self.conv1.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
|
||||
self.conv2.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
|
||||
self.conv3.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
|
||||
self.linear1.weight.data.mul_(math.sqrt(2)) # Multiplier for relu
|
||||
relu_gain = nn.init.calculate_gain('relu')
|
||||
self.conv1.weight.data.mul_(relu_gain)
|
||||
self.conv2.weight.data.mul_(relu_gain)
|
||||
self.conv3.weight.data.mul_(relu_gain)
|
||||
self.linear1.weight.data.mul_(relu_gain)
|
||||
|
||||
self.train()
|
||||
|
||||
@@ -97,3 +100,112 @@ class ActorCritic(torch.nn.Module):
|
||||
dist_entropy = -(log_probs * probs).sum(-1).mean()
|
||||
|
||||
return values, action_log_probs, dist_entropy
|
||||
|
||||
|
||||
def weights_init_mlp(m):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find('Linear') != -1:
|
||||
m.weight.data.normal_(0, 1)
|
||||
m.weight.data *= 1 / torch.sqrt(m.weight.data.pow(2).sum(1, keepdim=True))
|
||||
if m.bias is not None:
|
||||
m.bias.data.fill_(0)
|
||||
|
||||
class MLPPolicy(torch.nn.Module):
|
||||
def __init__(self, num_inputs, action_space):
|
||||
super(MLPPolicy, self).__init__()
|
||||
|
||||
self.obs_filter = ObsNorm((1, num_inputs), clip=5)
|
||||
self.action_space = action_space
|
||||
|
||||
self.a_fc1 = nn.Linear(num_inputs, 64, bias=False)
|
||||
self.a_ab1 = AddBias(64)
|
||||
self.a_fc2 = nn.Linear(64, 64, bias=False)
|
||||
self.a_ab2 = AddBias(64)
|
||||
self.a_fc_mean = nn.Linear(64, action_space.shape[0], bias=False)
|
||||
self.a_ab_mean = AddBias(action_space.shape[0])
|
||||
self.a_ab_logstd = AddBias(action_space.shape[0])
|
||||
|
||||
self.v_fc1 = nn.Linear(num_inputs, 64, bias=False)
|
||||
self.v_ab1 = AddBias(64)
|
||||
self.v_fc2 = nn.Linear(64, 64, bias=False)
|
||||
self.v_ab2 = AddBias(64)
|
||||
self.v_fc3 = nn.Linear(64, 1, bias=False)
|
||||
self.v_ab3 = AddBias(1)
|
||||
|
||||
self.apply(weights_init_mlp)
|
||||
|
||||
tanh_gain = nn.init.calculate_gain('tanh')
|
||||
#self.a_fc1.weight.data.mul_(tanh_gain)
|
||||
#self.a_fc2.weight.data.mul_(tanh_gain)
|
||||
self.a_fc_mean.weight.data.mul_(0.01)
|
||||
#self.v_fc1.weight.data.mul_(tanh_gain)
|
||||
#self.v_fc2.weight.data.mul_(tanh_gain)
|
||||
|
||||
self.train()
|
||||
|
||||
def cuda(self, **args):
|
||||
super(MLPPolicy, self).cuda(**args)
|
||||
self.obs_filter.cuda()
|
||||
|
||||
def forward(self, inputs):
|
||||
inputs.data = self.obs_filter(inputs.data)
|
||||
|
||||
x = self.v_fc1(inputs)
|
||||
x = self.v_ab1(x)
|
||||
x = F.tanh(x)
|
||||
|
||||
x = self.v_fc2(x)
|
||||
x = self.v_ab2(x)
|
||||
x = F.tanh(x)
|
||||
|
||||
x = self.v_fc3(x)
|
||||
x = self.v_ab3(x)
|
||||
value = x
|
||||
|
||||
x = self.a_fc1(inputs)
|
||||
x = self.a_ab1(x)
|
||||
x = F.tanh(x)
|
||||
|
||||
x = self.a_fc2(x)
|
||||
x = self.a_ab2(x)
|
||||
x = F.tanh(x)
|
||||
|
||||
x = self.a_fc_mean(x)
|
||||
x = self.a_ab_mean(x)
|
||||
action_mean = x
|
||||
|
||||
# An ugly hack for my KFAC implementation.
|
||||
zeros = Variable(torch.zeros(x.size()), volatile=x.volatile)
|
||||
if x.is_cuda:
|
||||
zeros = zeros.cuda()
|
||||
|
||||
x = self.a_ab_logstd(zeros)
|
||||
action_logstd = x
|
||||
|
||||
return value, action_mean, action_logstd
|
||||
|
||||
def act(self, inputs):
|
||||
value, action_mean, action_logstd = self(inputs)
|
||||
|
||||
action_std = action_logstd.exp()
|
||||
|
||||
noise = Variable(torch.randn(action_std.size()))
|
||||
if action_std.is_cuda:
|
||||
noise = noise.cuda()
|
||||
|
||||
action = action_mean + action_std * noise
|
||||
return value, action
|
||||
|
||||
def evaluate_actions(self, inputs, actions):
|
||||
assert inputs.dim() == 2, "Expect to have inputs in num_processes * num_steps x ... format"
|
||||
|
||||
value, action_mean, action_logstd = self(inputs)
|
||||
|
||||
action_std = action_logstd.exp()
|
||||
|
||||
action_log_probs = -0.5 * ((actions - action_mean) / action_std).pow(2) - 0.5 * math.log(2 * math.pi) - action_logstd
|
||||
action_log_probs = action_log_probs.sum(1, keepdim=True)
|
||||
dist_entropy = 0.5 + math.log(2 * math.pi) + action_log_probs
|
||||
dist_entropy = dist_entropy.sum(-1).mean()
|
||||
|
||||
return value, action_log_probs, dist_entropy
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
import random
|
||||
|
||||
import torch
|
||||
|
||||
class ObsNorm(object):
|
||||
def __init__(self, shape, demean=True, destd=True, clip=10.0):
|
||||
self.demean = demean
|
||||
self.destd = destd
|
||||
self.clip = clip
|
||||
|
||||
self.count = torch.zeros(1).double() + 1e-2
|
||||
self.sum = torch.zeros(shape).double()
|
||||
self.sum_sqr = torch.zeros(shape).double() + 1e-2
|
||||
|
||||
self.mean = torch.zeros(shape)
|
||||
self.std = torch.ones(shape)
|
||||
|
||||
def cuda(self):
|
||||
self.count = self.count.cuda()
|
||||
self.sum = self.sum.cuda()
|
||||
self.sum_sqr = self.sum_sqr.cuda()
|
||||
|
||||
self.mean = self.mean.cuda()
|
||||
self.std = self.std.cuda()
|
||||
|
||||
def update(self, x):
|
||||
self.count += x.size(0)
|
||||
self.sum += x.sum(0, keepdim=True).double()
|
||||
self.sum_sqr += x.pow(2).sum(0, keepdim=True).double()
|
||||
|
||||
self.mean = self.sum / self.count
|
||||
self.std = (self.sum_sqr / self.count - self.mean.pow(2)).clamp(1e-2, 1e9).sqrt()
|
||||
|
||||
self.mean = self.mean.float()
|
||||
self.std = self.std.float()
|
||||
|
||||
def __call__(self, x):
|
||||
if self.demean:
|
||||
x = x - self.mean
|
||||
if self.destd:
|
||||
x = x / self.std
|
||||
if self.clip:
|
||||
x = x.clamp(-self.clip, self.clip)
|
||||
return x
|
||||
@@ -2,13 +2,19 @@ import torch
|
||||
|
||||
|
||||
class RolloutStorage(object):
|
||||
def __init__(self, num_steps, num_processes, obs_shape, action_shape):
|
||||
def __init__(self, num_steps, num_processes, obs_shape, action_space):
|
||||
self.states = torch.zeros(num_steps + 1, num_processes, *obs_shape)
|
||||
self.rewards = torch.zeros(num_steps, num_processes, 1)
|
||||
self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
|
||||
self.returns = torch.zeros(num_steps + 1, num_processes, 1)
|
||||
self.actions = torch.LongTensor(num_steps, num_processes, 1)
|
||||
self.masks = torch.zeros(num_steps, num_processes, 1)
|
||||
if action_space.__class__.__name__ == 'Discrete':
|
||||
action_shape = 1
|
||||
else:
|
||||
action_shape = action_space.shape[0]
|
||||
self.actions = torch.zeros(num_steps, num_processes, action_shape)
|
||||
if action_space.__class__.__name__ == 'Discrete':
|
||||
self.actions = self.actions.long()
|
||||
self.masks = torch.ones(num_steps + 1, num_processes, 1)
|
||||
|
||||
def cuda(self):
|
||||
self.states = self.states.cuda()
|
||||
@@ -30,8 +36,7 @@ class RolloutStorage(object):
|
||||
self.value_preds[-1] = next_value
|
||||
gae = 0
|
||||
for step in reversed(range(self.rewards.size(0))):
|
||||
delta = self.rewards[step] + gamma * self.value_preds[step +
|
||||
1] * self.masks[step] - self.value_preds[step]
|
||||
delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step] - self.value_preds[step]
|
||||
gae = delta + gamma * tau * self.masks[step] * gae
|
||||
self.returns[step] = gae + self.value_preds[step]
|
||||
else:
|
||||
|
||||
@@ -106,12 +106,20 @@ def visdom_plot(viz, win, folder, game, name, bin_size=100, smooth=1):
|
||||
|
||||
fig = plt.figure()
|
||||
plt.plot(tx, ty, label="{}".format(name))
|
||||
plt.xticks([4*1e6, 4*2e6, 4*4e6, 4*6e6, 4*8e6, 4*10e6],
|
||||
["1M", "2M", "4M", "6M", "8M", "10M"])
|
||||
|
||||
# Ugly hack to detect atari
|
||||
if game.find('NoFrameskip') > -1:
|
||||
plt.xticks([4*1e6, 4*2e6, 4*4e6, 4*6e6, 4*8e6, 4*10e6],
|
||||
["1M", "2M", "4M", "6M", "8M", "10M"])
|
||||
plt.xlim(0, 40e6)
|
||||
else:
|
||||
plt.xticks([1e5, 2e5, 4e5, 6e5, 8e5, 1e5],
|
||||
["0.1M", "0.2M", "0.4M", "0.6M", "0.8M", "1M"])
|
||||
plt.xlim(0, 1e6)
|
||||
|
||||
plt.xlabel('Number of Timesteps')
|
||||
plt.ylabel('Rewards')
|
||||
|
||||
plt.xlim(0, 40e6)
|
||||
|
||||
plt.title(game)
|
||||
plt.legend(loc=4)
|
||||
@@ -130,5 +138,4 @@ def visdom_plot(viz, win, folder, game, name, bin_size=100, smooth=1):
|
||||
if __name__ == "__main__":
|
||||
from visdom import Visdom
|
||||
viz = Visdom()
|
||||
visdom_plot(
|
||||
viz, None, '/tmp/gym/', 'BreakOut', 'a2c', bin_size=100, smooth=1)
|
||||
visdom_plot(viz, None, '/tmp/gym/', 'BreakOut', 'a2c', bin_size=100, smooth=1)
|
||||