Files
Run-Skeleton-Run/baselines/trajectories.py
T
Kolesnikov Sergey 7401266fe7 pytorch version
2017-11-15 22:18:46 +03:00

77 lines
2.7 KiB
Python

import numpy as np
def traj_segment_generator(pi, env, args, horizon, stochastic):
# Initialize state variables
t = 0
ac = env.action_space.sample() # not used, just so we have the datatype
new = True # marks if we're on first timestep of an episode
ob = env.reset(difficulty=args.difficulty)
cur_ep_ret = 0 # return in current episode
cur_ep_len = 0 # len of current episode
ep_rets = [] # returns of completed episodes in this segment
ep_lens = [] # lengths of ...
# Initialize history arrays
obs = np.array([ob for _ in range(horizon)])
rews = np.zeros(horizon, 'float32')
vpreds = np.zeros(horizon, 'float32')
news = np.zeros(horizon, 'int32')
acs = np.array([ac for _ in range(horizon)])
prevacs = acs.copy()
while True:
prevac = ac
ac, vpred = pi.act(stochastic, ob)
# Slight weirdness here because we need value function at time T
# before returning segment [0, T-1] so we get the correct
# terminal value
if t > 0 and t % horizon == 0:
yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
"ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
"ep_rets": ep_rets, "ep_lens": ep_lens}
# @TODO: TRPO & PPO implementation diff
# _, vpred = pi.act(stochastic, ob) # @TODO: uncomment??? IMPORTANT!!
# Be careful!!! if you change the downstream algorithm to aggregate
# several of these batches, then be sure to do a deepcopy
ep_rets = []
ep_lens = []
i = t % horizon
obs[i] = ob
vpreds[i] = vpred
news[i] = new
acs[i] = ac
prevacs[i] = prevac
ob, rew, new, _ = env.step(ac)
rews[i] = rew
cur_ep_ret += rew
cur_ep_len += 1
if new:
ep_rets.append(cur_ep_ret)
ep_lens.append(cur_ep_len)
cur_ep_ret = 0
cur_ep_len = 0
ob = env.reset(difficulty=args.difficulty)
t += 1
def add_vtarg_and_adv(seg, gamma, lam):
"""
Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
"""
# last element is only used for last vtarg, but we already zeroed it if last new = 1
new = np.append(seg["new"], 0)
vpred = np.append(seg["vpred"], seg["nextvpred"])
T = len(seg["rew"])
seg["adv"] = gaelam = np.empty(T, 'float32')
rew = seg["rew"]
lastgaelam = 0
for t in reversed(range(T)):
nonterminal = 1 - new[t + 1]
delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
seg["tdlamret"] = seg["adv"] + seg["vpred"]