mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 22:20:31 +08:00
[rllib] A3C Refactoring (#1166)
* fixing policy * Compute Action is singular, fixed weird issue with arrays * remove vestige * extraneous ipdb * Can Drop in Pytorch Model * lint * naming * finish comments
This commit is contained in:
@@ -4,14 +4,12 @@ from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pickle
|
||||
import tensorflow as tf
|
||||
import six.moves.queue as queue
|
||||
import os
|
||||
|
||||
import ray
|
||||
from ray.rllib.agent import Agent
|
||||
from ray.rllib.a3c.runner import RunnerThread, process_rollout
|
||||
from ray.rllib.a3c.envs import create_and_wrap
|
||||
from ray.rllib.a3c.runner import RemoteRunner
|
||||
from ray.rllib.a3c.shared_model import SharedModel
|
||||
from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
|
||||
from ray.tune.result import TrainingResult
|
||||
@@ -24,76 +22,11 @@ DEFAULT_CONFIG = {
|
||||
"use_lstm": True,
|
||||
"model": {"grayscale": True,
|
||||
"zero_mean": False,
|
||||
"dim": 42}
|
||||
"dim": 42,
|
||||
"channel_major": True}
|
||||
}
|
||||
|
||||
|
||||
@ray.remote
|
||||
class Runner(object):
|
||||
"""Actor object to start running simulation on workers.
|
||||
|
||||
The gradient computation is also executed from this object.
|
||||
"""
|
||||
def __init__(self, env_creator, policy_cls, actor_id, batch_size,
|
||||
preprocess_config, logdir):
|
||||
env = create_and_wrap(env_creator, preprocess_config)
|
||||
self.id = actor_id
|
||||
# TODO(rliaw): should change this to be just env.observation_space
|
||||
self.policy = policy_cls(env.observation_space.shape, env.action_space)
|
||||
self.runner = RunnerThread(env, self.policy, batch_size)
|
||||
self.env = env
|
||||
self.logdir = logdir
|
||||
self.start()
|
||||
|
||||
def pull_batch_from_queue(self):
|
||||
"""Take a rollout from the queue of the thread runner."""
|
||||
rollout = self.runner.queue.get(timeout=600.0)
|
||||
if isinstance(rollout, BaseException):
|
||||
raise rollout
|
||||
while not rollout.terminal:
|
||||
try:
|
||||
part = self.runner.queue.get_nowait()
|
||||
if isinstance(part, BaseException):
|
||||
raise rollout
|
||||
rollout.extend(part)
|
||||
except queue.Empty:
|
||||
break
|
||||
return rollout
|
||||
|
||||
def get_completed_rollout_metrics(self):
|
||||
"""Returns metrics on previously completed rollouts.
|
||||
|
||||
Calling this clears the queue of completed rollout metrics.
|
||||
"""
|
||||
completed = []
|
||||
while True:
|
||||
try:
|
||||
completed.append(self.runner.metrics_queue.get_nowait())
|
||||
except queue.Empty:
|
||||
break
|
||||
return completed
|
||||
|
||||
def start(self):
|
||||
summary_writer = tf.summary.FileWriter(
|
||||
os.path.join(self.logdir, "agent_%d" % self.id))
|
||||
self.summary_writer = summary_writer
|
||||
self.runner.start_runner(self.policy.sess, summary_writer)
|
||||
|
||||
def compute_gradient(self, params):
|
||||
self.policy.set_weights(params)
|
||||
rollout = self.pull_batch_from_queue()
|
||||
batch = process_rollout(rollout, gamma=0.99, lambda_=1.0)
|
||||
gradient, info = self.policy.get_gradients(batch)
|
||||
if "summary" in info:
|
||||
self.summary_writer.add_summary(
|
||||
tf.Summary.FromString(info['summary']),
|
||||
self.policy.local_steps)
|
||||
self.summary_writer.flush()
|
||||
info = {"id": self.id,
|
||||
"size": len(batch.a)}
|
||||
return gradient, info
|
||||
|
||||
|
||||
class A3CAgent(Agent):
|
||||
_agent_name = "A3C"
|
||||
_default_config = DEFAULT_CONFIG
|
||||
@@ -107,9 +40,9 @@ class A3CAgent(Agent):
|
||||
self.policy = policy_cls(
|
||||
self.env.observation_space.shape, self.env.action_space)
|
||||
self.agents = [
|
||||
Runner.remote(self.env_creator, policy_cls, i,
|
||||
self.config["batch_size"],
|
||||
self.config["model"], self.logdir)
|
||||
RemoteRunner.remote(self.env_creator, policy_cls, i,
|
||||
self.config["batch_size"],
|
||||
self.config["model"], self.logdir)
|
||||
for i in range(self.config["num_workers"])]
|
||||
self.parameters = self.policy.get_weights()
|
||||
|
||||
@@ -122,7 +55,7 @@ class A3CAgent(Agent):
|
||||
while gradient_list:
|
||||
done_id, gradient_list = ray.wait(gradient_list)
|
||||
gradient, info = ray.get(done_id)[0]
|
||||
self.policy.model_update(gradient)
|
||||
self.policy.apply_gradients(gradient)
|
||||
self.parameters = self.policy.get_weights()
|
||||
if batches_so_far < max_batches:
|
||||
batches_so_far += 1
|
||||
@@ -168,5 +101,5 @@ class A3CAgent(Agent):
|
||||
self.policy.set_weights(self.parameters)
|
||||
|
||||
def compute_action(self, observation):
|
||||
actions = self.policy.compute_actions(observation)
|
||||
actions = self.policy.compute_action(observation)
|
||||
return actions[0]
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import scipy.signal
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
def discount(x, gamma):
|
||||
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
|
||||
|
||||
|
||||
def process_rollout(rollout, gamma, lambda_=1.0):
|
||||
"""Given a rollout, compute its returns and the advantage."""
|
||||
batch_si = np.asarray(rollout.states)
|
||||
batch_a = np.asarray(rollout.actions)
|
||||
rewards = np.asarray(rollout.rewards)
|
||||
vpred_t = np.asarray(rollout.values + [rollout.r])
|
||||
|
||||
rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
|
||||
batch_r = discount(rewards_plus_v, gamma)[:-1]
|
||||
delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
|
||||
# This formula for the advantage comes "Generalized Advantage Estimation":
|
||||
# https://arxiv.org/abs/1506.02438
|
||||
batch_adv = discount(delta_t, gamma * lambda_)
|
||||
|
||||
features = rollout.features[0]
|
||||
return Batch(batch_si, batch_a, batch_adv, batch_r, rollout.terminal,
|
||||
features)
|
||||
|
||||
|
||||
Batch = namedtuple(
|
||||
"Batch", ["si", "a", "adv", "r", "terminal", "features"])
|
||||
|
||||
CompletedRollout = namedtuple(
|
||||
"CompletedRollout", ["episode_length", "episode_reward"])
|
||||
@@ -2,99 +2,29 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import ray
|
||||
import gym
|
||||
|
||||
|
||||
class Policy(object):
|
||||
"""The policy base class."""
|
||||
def __init__(self, ob_space, action_space, name="local", summarize=True):
|
||||
self.local_steps = 0
|
||||
self.summarize = summarize
|
||||
worker_device = "/job:localhost/replica:0/task:0/cpu:0"
|
||||
self.g = tf.Graph()
|
||||
with self.g.as_default(), tf.device(worker_device):
|
||||
with tf.variable_scope(name):
|
||||
self.setup_graph(ob_space, action_space)
|
||||
assert all([hasattr(self, attr)
|
||||
for attr in ["vf", "logits", "x", "var_list"]])
|
||||
print("Setting up loss")
|
||||
self.setup_loss(action_space)
|
||||
self.setup_gradients()
|
||||
self.initialize()
|
||||
pass
|
||||
|
||||
def setup_graph(self):
|
||||
def apply_gradients(self, grads):
|
||||
raise NotImplementedError
|
||||
|
||||
def setup_loss(self, action_space):
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
ac_size = action_space.shape[0]
|
||||
self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
self.ac = tf.placeholder(tf.int64, [None], name="ac")
|
||||
else:
|
||||
raise NotImplemented(
|
||||
"action space" + str(type(action_space)) +
|
||||
"currently not supported")
|
||||
self.adv = tf.placeholder(tf.float32, [None], name="adv")
|
||||
self.r = tf.placeholder(tf.float32, [None], name="r")
|
||||
|
||||
log_prob = self.curr_dist.logp(self.ac)
|
||||
|
||||
# The "policy gradients" loss: its derivative is precisely the policy
|
||||
# gradient. Notice that self.ac is a placeholder that is provided
|
||||
# externally. adv will contain the advantages, as calculated in
|
||||
# process_rollout.
|
||||
self.pi_loss = - tf.reduce_sum(log_prob * self.adv)
|
||||
|
||||
delta = self.vf - self.r
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
|
||||
self.entropy = tf.reduce_sum(self.curr_dist.entropy())
|
||||
self.loss = self.pi_loss + 0.5 * self.vf_loss - self.entropy * 0.01
|
||||
|
||||
def setup_gradients(self):
|
||||
grads = tf.gradients(self.loss, self.var_list)
|
||||
self.grads, _ = tf.clip_by_global_norm(grads, 40.0)
|
||||
grads_and_vars = list(zip(self.grads, self.var_list))
|
||||
opt = tf.train.AdamOptimizer(1e-4)
|
||||
self._apply_gradients = opt.apply_gradients(grads_and_vars)
|
||||
|
||||
def initialize(self):
|
||||
if self.summarize:
|
||||
bs = tf.to_float(tf.shape(self.x)[0])
|
||||
tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
|
||||
tf.summary.scalar("model/value_loss", self.vf_loss / bs)
|
||||
tf.summary.scalar("model/entropy", self.entropy / bs)
|
||||
tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
|
||||
tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
|
||||
self.summary_op = tf.summary.merge_all()
|
||||
|
||||
self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))
|
||||
self.variables = ray.experimental.TensorFlowVariables(self.loss,
|
||||
self.sess)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def model_update(self, grads):
|
||||
feed_dict = {self.grads[i]: grads[i]
|
||||
for i in range(len(grads))}
|
||||
self.sess.run(self._apply_gradients, feed_dict=feed_dict)
|
||||
|
||||
def get_weights(self):
|
||||
weights = self.variables.get_weights()
|
||||
return weights
|
||||
raise NotImplementedError
|
||||
|
||||
def set_weights(self, weights):
|
||||
self.variables.set_weights(weights)
|
||||
raise NotImplementedError
|
||||
|
||||
def get_gradients(self, batch):
|
||||
def compute_gradients(self, batch):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_vf_loss(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def compute_actions(self, observations):
|
||||
def compute_action(self, observations):
|
||||
"""Compute action for a _single_ observation"""
|
||||
raise NotImplementedError
|
||||
|
||||
def value(self, ob):
|
||||
|
||||
+66
-166
@@ -2,182 +2,82 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
from ray.rllib.a3c.envs import create_and_wrap
|
||||
import tensorflow as tf
|
||||
import six.moves.queue as queue
|
||||
import scipy.signal
|
||||
import threading
|
||||
from ray.rllib.a3c.runner_thread import RunnerThread
|
||||
from ray.rllib.a3c.common import process_rollout
|
||||
from ray.rllib.a3c.tfpolicy import TFPolicy
|
||||
import ray
|
||||
import os
|
||||
|
||||
|
||||
def discount(x, gamma):
|
||||
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
|
||||
class Runner(object):
|
||||
"""Actor object to start running simulation on workers.
|
||||
|
||||
|
||||
def process_rollout(rollout, gamma, lambda_=1.0):
|
||||
"""Given a rollout, compute its returns and the advantage."""
|
||||
batch_si = np.asarray(rollout.states)
|
||||
batch_a = np.asarray(rollout.actions)
|
||||
rewards = np.asarray(rollout.rewards)
|
||||
vpred_t = np.asarray(rollout.values + [rollout.r])
|
||||
|
||||
rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
|
||||
batch_r = discount(rewards_plus_v, gamma)[:-1]
|
||||
delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
|
||||
# This formula for the advantage comes "Generalized Advantage Estimation":
|
||||
# https://arxiv.org/abs/1506.02438
|
||||
batch_adv = discount(delta_t, gamma * lambda_)
|
||||
|
||||
features = rollout.features[0]
|
||||
return Batch(batch_si, batch_a, batch_adv, batch_r, rollout.terminal,
|
||||
features)
|
||||
|
||||
|
||||
Batch = namedtuple(
|
||||
"Batch", ["si", "a", "adv", "r", "terminal", "features"])
|
||||
|
||||
CompletedRollout = namedtuple(
|
||||
"CompletedRollout", ["episode_length", "episode_reward"])
|
||||
|
||||
|
||||
class PartialRollout(object):
|
||||
"""A piece of a complete rollout.
|
||||
|
||||
We run our agent, and process its experience once it has processed enough
|
||||
steps.
|
||||
The gradient computation is also executed from this object.
|
||||
"""
|
||||
def __init__(self):
|
||||
self.states = []
|
||||
self.actions = []
|
||||
self.rewards = []
|
||||
self.values = []
|
||||
self.r = 0.0
|
||||
self.terminal = False
|
||||
self.features = []
|
||||
|
||||
def add(self, state, action, reward, value, terminal, features):
|
||||
self.states += [state]
|
||||
self.actions += [action]
|
||||
self.rewards += [reward]
|
||||
self.values += [value]
|
||||
self.terminal = terminal
|
||||
self.features += [features]
|
||||
|
||||
def extend(self, other):
|
||||
assert not self.terminal
|
||||
self.states.extend(other.states)
|
||||
self.actions.extend(other.actions)
|
||||
self.rewards.extend(other.rewards)
|
||||
self.values.extend(other.values)
|
||||
self.r = other.r
|
||||
self.terminal = other.terminal
|
||||
self.features.extend(other.features)
|
||||
|
||||
|
||||
class RunnerThread(threading.Thread):
|
||||
"""This thread interacts with the environment and tells it what to do."""
|
||||
def __init__(self, env, policy, num_local_steps, visualise=False):
|
||||
threading.Thread.__init__(self)
|
||||
self.queue = queue.Queue(5)
|
||||
self.metrics_queue = queue.Queue()
|
||||
self.num_local_steps = num_local_steps
|
||||
def __init__(self, env_creator, policy_cls, actor_id, batch_size,
|
||||
preprocess_config, logdir):
|
||||
env = create_and_wrap(env_creator, preprocess_config)
|
||||
self.id = actor_id
|
||||
# TODO(rliaw): should change this to be just env.observation_space
|
||||
self.policy = policy_cls(env.observation_space.shape, env.action_space)
|
||||
self.runner = RunnerThread(env, self.policy, batch_size)
|
||||
self.env = env
|
||||
self.last_features = None
|
||||
self.policy = policy
|
||||
self.daemon = True
|
||||
self.sess = None
|
||||
self.summary_writer = None
|
||||
self.visualise = visualise
|
||||
|
||||
def start_runner(self, sess, summary_writer):
|
||||
self.sess = sess
|
||||
self.summary_writer = summary_writer
|
||||
self.logdir = logdir
|
||||
self.start()
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
with self.sess.as_default():
|
||||
self._run()
|
||||
except BaseException as e:
|
||||
self.queue.put(e)
|
||||
raise e
|
||||
def pull_batch_from_queue(self):
|
||||
"""Take a rollout from the queue of the thread runner."""
|
||||
rollout = self.runner.queue.get(timeout=600.0)
|
||||
if isinstance(rollout, BaseException):
|
||||
raise rollout
|
||||
while not rollout.terminal:
|
||||
try:
|
||||
part = self.runner.queue.get_nowait()
|
||||
if isinstance(part, BaseException):
|
||||
raise rollout
|
||||
rollout.extend(part)
|
||||
except queue.Empty:
|
||||
break
|
||||
return rollout
|
||||
|
||||
def _run(self):
|
||||
rollout_provider = env_runner(
|
||||
self.env, self.policy, self.num_local_steps,
|
||||
self.summary_writer, self.visualise)
|
||||
def get_completed_rollout_metrics(self):
|
||||
"""Returns metrics on previously completed rollouts.
|
||||
|
||||
Calling this clears the queue of completed rollout metrics.
|
||||
"""
|
||||
completed = []
|
||||
while True:
|
||||
# The timeout variable exists because apparently, if one worker
|
||||
# dies, the other workers won't die with it, unless the timeout is
|
||||
# set to some large number. This is an empirical observation.
|
||||
item = next(rollout_provider)
|
||||
if isinstance(item, CompletedRollout):
|
||||
self.metrics_queue.put(item)
|
||||
else:
|
||||
self.queue.put(item, timeout=600.0)
|
||||
try:
|
||||
completed.append(self.runner.metrics_queue.get_nowait())
|
||||
except queue.Empty:
|
||||
break
|
||||
return completed
|
||||
|
||||
def start(self):
|
||||
summary_writer = tf.summary.FileWriter(
|
||||
os.path.join(self.logdir, "agent_%d" % self.id))
|
||||
self.summary_writer = summary_writer
|
||||
if isinstance(self.policy, TFPolicy):
|
||||
self.runner.start_runner(self.policy.sess, summary_writer)
|
||||
else:
|
||||
self.runner.start_runner(tf.Session(), summary_writer)
|
||||
|
||||
def compute_gradient(self, params):
|
||||
self.policy.set_weights(params)
|
||||
rollout = self.pull_batch_from_queue()
|
||||
batch = process_rollout(rollout, gamma=0.99, lambda_=1.0)
|
||||
gradient, info = self.policy.compute_gradients(batch)
|
||||
if "summary" in info:
|
||||
self.summary_writer.add_summary(
|
||||
tf.Summary.FromString(info['summary']),
|
||||
self.policy.local_steps)
|
||||
self.summary_writer.flush()
|
||||
info = {"id": self.id,
|
||||
"size": len(batch.a)}
|
||||
return gradient, info
|
||||
|
||||
|
||||
def env_runner(env, policy, num_local_steps, summary_writer, render):
|
||||
"""This implements the logic of the thread runner.
|
||||
|
||||
It continually runs the policy, and as long as the rollout exceeds a
|
||||
certain length, the thread runner appends the policy to the queue.
|
||||
"""
|
||||
last_state = env.reset()
|
||||
timestep_limit = env.spec.tags.get("wrapper_config.TimeLimit"
|
||||
".max_episode_steps")
|
||||
last_features = policy.get_initial_features()
|
||||
length = 0
|
||||
rewards = 0
|
||||
rollout_number = 0
|
||||
|
||||
while True:
|
||||
terminal_end = False
|
||||
rollout = PartialRollout()
|
||||
|
||||
for _ in range(num_local_steps):
|
||||
fetched = policy.compute_actions(last_state, *last_features)
|
||||
action, value_, features = fetched[0], fetched[1], fetched[2:]
|
||||
# Argmax to convert from one-hot.
|
||||
state, reward, terminal, info = env.step(action)
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
length += 1
|
||||
rewards += reward
|
||||
if length >= timestep_limit:
|
||||
terminal = True
|
||||
|
||||
# Collect the experience.
|
||||
rollout.add(last_state, action, reward, value_, terminal,
|
||||
last_features)
|
||||
|
||||
last_state = state
|
||||
last_features = features
|
||||
|
||||
if info:
|
||||
summary = tf.Summary()
|
||||
for k, v in info.items():
|
||||
summary.value.add(tag=k, simple_value=float(v))
|
||||
summary_writer.add_summary(summary, rollout_number)
|
||||
summary_writer.flush()
|
||||
|
||||
if terminal:
|
||||
terminal_end = True
|
||||
yield CompletedRollout(length, rewards)
|
||||
|
||||
if (length >= timestep_limit or
|
||||
not env.metadata.get("semantics.autoreset")):
|
||||
last_state = env.reset()
|
||||
last_features = policy.get_initial_features()
|
||||
rollout_number += 1
|
||||
length = 0
|
||||
rewards = 0
|
||||
break
|
||||
|
||||
if not terminal_end:
|
||||
rollout.r = policy.value(last_state, *last_features)
|
||||
|
||||
# Once we have enough experience, yield it, and have the ThreadRunner
|
||||
# place it on a queue.
|
||||
yield rollout
|
||||
RemoteRunner = ray.remote(Runner)
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import six.moves.queue as queue
|
||||
import threading
|
||||
from ray.rllib.a3c.common import CompletedRollout
|
||||
|
||||
|
||||
class PartialRollout(object):
|
||||
"""A piece of a complete rollout.
|
||||
|
||||
We run our agent, and process its experience once it has processed enough
|
||||
steps.
|
||||
"""
|
||||
def __init__(self):
|
||||
self.states = []
|
||||
self.actions = []
|
||||
self.rewards = []
|
||||
self.values = []
|
||||
self.r = 0.0
|
||||
self.terminal = False
|
||||
self.features = []
|
||||
|
||||
def add(self, state, action, reward, value, terminal, features):
|
||||
self.states += [state]
|
||||
self.actions += [action]
|
||||
self.rewards += [reward]
|
||||
self.values += [value]
|
||||
self.terminal = terminal
|
||||
self.features += [features]
|
||||
|
||||
def extend(self, other):
|
||||
assert not self.terminal
|
||||
self.states.extend(other.states)
|
||||
self.actions.extend(other.actions)
|
||||
self.rewards.extend(other.rewards)
|
||||
self.values.extend(other.values)
|
||||
self.r = other.r
|
||||
self.terminal = other.terminal
|
||||
self.features.extend(other.features)
|
||||
|
||||
|
||||
class RunnerThread(threading.Thread):
|
||||
"""This thread interacts with the environment and tells it what to do."""
|
||||
def __init__(self, env, policy, num_local_steps, visualise=False):
|
||||
threading.Thread.__init__(self)
|
||||
self.queue = queue.Queue(5)
|
||||
self.metrics_queue = queue.Queue()
|
||||
self.num_local_steps = num_local_steps
|
||||
self.env = env
|
||||
self.last_features = None
|
||||
self.policy = policy
|
||||
self.daemon = True
|
||||
self.sess = None
|
||||
self.summary_writer = None
|
||||
self.visualise = visualise
|
||||
|
||||
def start_runner(self, sess, summary_writer):
|
||||
self.sess = sess
|
||||
self.summary_writer = summary_writer
|
||||
self.start()
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
with self.sess.as_default():
|
||||
self._run()
|
||||
except BaseException as e:
|
||||
self.queue.put(e)
|
||||
raise e
|
||||
|
||||
def _run(self):
|
||||
rollout_provider = env_runner(
|
||||
self.env, self.policy, self.num_local_steps,
|
||||
self.summary_writer, self.visualise)
|
||||
while True:
|
||||
# The timeout variable exists because apparently, if one worker
|
||||
# dies, the other workers won't die with it, unless the timeout is
|
||||
# set to some large number. This is an empirical observation.
|
||||
item = next(rollout_provider)
|
||||
if isinstance(item, CompletedRollout):
|
||||
self.metrics_queue.put(item)
|
||||
else:
|
||||
self.queue.put(item, timeout=600.0)
|
||||
|
||||
|
||||
def env_runner(env, policy, num_local_steps, summary_writer, render):
|
||||
"""This implements the logic of the thread runner.
|
||||
|
||||
It continually runs the policy, and as long as the rollout exceeds a
|
||||
certain length, the thread runner appends the policy to the queue.
|
||||
"""
|
||||
last_state = env.reset()
|
||||
timestep_limit = env.spec.tags.get("wrapper_config.TimeLimit"
|
||||
".max_episode_steps")
|
||||
last_features = policy.get_initial_features()
|
||||
length = 0
|
||||
rewards = 0
|
||||
rollout_number = 0
|
||||
|
||||
while True:
|
||||
terminal_end = False
|
||||
rollout = PartialRollout()
|
||||
|
||||
for _ in range(num_local_steps):
|
||||
fetched = policy.compute_action(last_state, *last_features)
|
||||
action, value_, features = fetched[0], fetched[1], fetched[2:]
|
||||
# Argmax to convert from one-hot.
|
||||
state, reward, terminal, info = env.step(action)
|
||||
if render:
|
||||
env.render()
|
||||
|
||||
length += 1
|
||||
rewards += reward
|
||||
if length >= timestep_limit:
|
||||
terminal = True
|
||||
|
||||
# Collect the experience.
|
||||
rollout.add(last_state, action, reward, value_, terminal,
|
||||
last_features)
|
||||
|
||||
last_state = state
|
||||
last_features = features
|
||||
|
||||
if info:
|
||||
summary = tf.Summary()
|
||||
for k, v in info.items():
|
||||
summary.value.add(tag=k, simple_value=float(v))
|
||||
summary_writer.add_summary(summary, rollout_number)
|
||||
summary_writer.flush()
|
||||
|
||||
if terminal:
|
||||
terminal_end = True
|
||||
yield CompletedRollout(length, rewards)
|
||||
|
||||
if (length >= timestep_limit or
|
||||
not env.metadata.get("semantics.autoreset")):
|
||||
last_state = env.reset()
|
||||
last_features = policy.get_initial_features()
|
||||
rollout_number += 1
|
||||
length = 0
|
||||
rewards = 0
|
||||
break
|
||||
|
||||
if not terminal_end:
|
||||
rollout.r = policy.value(last_state, *last_features)
|
||||
|
||||
# Once we have enough experience, yield it, and have the ThreadRunner
|
||||
# place it on a queue.
|
||||
yield rollout
|
||||
@@ -4,11 +4,11 @@ from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
from ray.rllib.models.misc import linear, normc_initializer
|
||||
from ray.rllib.a3c.policy import Policy
|
||||
from ray.rllib.a3c.tfpolicy import TFPolicy
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
|
||||
|
||||
class SharedModel(Policy):
|
||||
class SharedModel(TFPolicy):
|
||||
def __init__(self, ob_space, ac_space, **kwargs):
|
||||
super(SharedModel, self).__init__(ob_space, ac_space, **kwargs)
|
||||
|
||||
@@ -31,7 +31,7 @@ class SharedModel(Policy):
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
|
||||
def get_gradients(self, batch):
|
||||
def compute_gradients(self, batch):
|
||||
info = {}
|
||||
feed_dict = {
|
||||
self.x: batch.si,
|
||||
@@ -49,13 +49,14 @@ class SharedModel(Policy):
|
||||
grad = self.sess.run(self.grads, feed_dict=feed_dict)
|
||||
return grad, info
|
||||
|
||||
def compute_actions(self, ob, *args):
|
||||
def compute_action(self, ob, *args):
|
||||
action, vf = self.sess.run([self.sample, self.vf],
|
||||
{self.x: [ob]})
|
||||
return action[0], vf
|
||||
return action[0], vf[0]
|
||||
|
||||
def value(self, ob, *args):
|
||||
return self.sess.run(self.vf, {self.x: [ob]})[0]
|
||||
vf = self.sess.run(self.vf, {self.x: [ob]})
|
||||
return vf[0]
|
||||
|
||||
def get_initial_features(self):
|
||||
return []
|
||||
|
||||
@@ -5,11 +5,11 @@ from __future__ import print_function
|
||||
import tensorflow as tf
|
||||
from ray.rllib.models.misc import linear, normc_initializer
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.a3c.policy import Policy
|
||||
from ray.rllib.a3c.tfpolicy import TFPolicy
|
||||
from ray.rllib.models.lstm import LSTM
|
||||
|
||||
|
||||
class SharedModelLSTM(Policy):
|
||||
class SharedModelLSTM(TFPolicy):
|
||||
|
||||
def __init__(self, ob_space, ac_space, **kwargs):
|
||||
super(SharedModelLSTM, self).__init__(ob_space, ac_space, **kwargs)
|
||||
@@ -38,7 +38,7 @@ class SharedModelLSTM(Policy):
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
|
||||
def get_gradients(self, batch):
|
||||
def compute_gradients(self, batch):
|
||||
"""Computing the gradient is actually model-dependent.
|
||||
|
||||
The LSTM needs its hidden states in order to compute the gradient
|
||||
@@ -62,20 +62,18 @@ class SharedModelLSTM(Policy):
|
||||
grad = self.sess.run(self.grads, feed_dict=feed_dict)
|
||||
return grad, info
|
||||
|
||||
def compute_actions(self, ob, c, h):
|
||||
output = self.sess.run([self.sample, self.vf] + self.state_out,
|
||||
{self.x: [ob],
|
||||
self.state_in[0]: c,
|
||||
self.state_in[1]: h})
|
||||
output = list(output)
|
||||
output[0] = output[0][0]
|
||||
return output
|
||||
def compute_action(self, ob, c, h):
|
||||
action, vf, c, h = self.sess.run(
|
||||
[self.sample, self.vf] + self.state_out,
|
||||
{self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})
|
||||
return action[0], vf[0], c, h
|
||||
|
||||
def value(self, ob, c, h):
|
||||
# process_rollout is very non-intuitive due to value being a float
|
||||
return self.sess.run(self.vf, {self.x: [ob],
|
||||
self.state_in[0]: c,
|
||||
self.state_in[1]: h})[0]
|
||||
vf = self.sess.run(self.vf, {self.x: [ob],
|
||||
self.state_in[0]: c,
|
||||
self.state_in[1]: h})
|
||||
return vf[0]
|
||||
|
||||
def get_initial_features(self):
|
||||
return self.state_init
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import ray
|
||||
import gym
|
||||
from ray.rllib.a3c.policy import Policy
|
||||
|
||||
|
||||
class TFPolicy(Policy):
|
||||
"""The policy base class."""
|
||||
def __init__(self, ob_space, action_space, name="local", summarize=True):
|
||||
self.local_steps = 0
|
||||
self.summarize = summarize
|
||||
worker_device = "/job:localhost/replica:0/task:0/cpu:0"
|
||||
self.g = tf.Graph()
|
||||
with self.g.as_default(), tf.device(worker_device):
|
||||
with tf.variable_scope(name):
|
||||
self.setup_graph(ob_space, action_space)
|
||||
assert all([hasattr(self, attr)
|
||||
for attr in ["vf", "logits", "x", "var_list"]])
|
||||
print("Setting up loss")
|
||||
self.setup_loss(action_space)
|
||||
self.setup_gradients()
|
||||
self.initialize()
|
||||
|
||||
def setup_graph(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def setup_loss(self, action_space):
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
ac_size = action_space.shape[0]
|
||||
self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
self.ac = tf.placeholder(tf.int64, [None], name="ac")
|
||||
else:
|
||||
raise NotImplemented(
|
||||
"action space" + str(type(action_space)) +
|
||||
"currently not supported")
|
||||
self.adv = tf.placeholder(tf.float32, [None], name="adv")
|
||||
self.r = tf.placeholder(tf.float32, [None], name="r")
|
||||
|
||||
log_prob = self.curr_dist.logp(self.ac)
|
||||
|
||||
# The "policy gradients" loss: its derivative is precisely the policy
|
||||
# gradient. Notice that self.ac is a placeholder that is provided
|
||||
# externally. adv will contain the advantages, as calculated in
|
||||
# process_rollout.
|
||||
self.pi_loss = - tf.reduce_sum(log_prob * self.adv)
|
||||
|
||||
delta = self.vf - self.r
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
|
||||
self.entropy = tf.reduce_sum(self.curr_dist.entropy())
|
||||
self.loss = self.pi_loss + 0.5 * self.vf_loss - self.entropy * 0.01
|
||||
|
||||
def setup_gradients(self):
|
||||
grads = tf.gradients(self.loss, self.var_list)
|
||||
self.grads, _ = tf.clip_by_global_norm(grads, 40.0)
|
||||
grads_and_vars = list(zip(self.grads, self.var_list))
|
||||
opt = tf.train.AdamOptimizer(1e-4)
|
||||
self._apply_gradients = opt.apply_gradients(grads_and_vars)
|
||||
|
||||
def initialize(self):
|
||||
if self.summarize:
|
||||
bs = tf.to_float(tf.shape(self.x)[0])
|
||||
tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
|
||||
tf.summary.scalar("model/value_loss", self.vf_loss / bs)
|
||||
tf.summary.scalar("model/entropy", self.entropy / bs)
|
||||
tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
|
||||
tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
|
||||
self.summary_op = tf.summary.merge_all()
|
||||
|
||||
self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))
|
||||
self.variables = ray.experimental.TensorFlowVariables(self.loss,
|
||||
self.sess)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def apply_gradients(self, grads):
|
||||
feed_dict = {self.grads[i]: grads[i]
|
||||
for i in range(len(grads))}
|
||||
self.sess.run(self._apply_gradients, feed_dict=feed_dict)
|
||||
|
||||
def get_weights(self):
|
||||
weights = self.variables.get_weights()
|
||||
return weights
|
||||
|
||||
def set_weights(self, weights):
|
||||
self.variables.set_weights(weights)
|
||||
|
||||
def compute_gradients(self, batch):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_vf_loss(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def compute_action(self, observations):
|
||||
raise NotImplementedError
|
||||
|
||||
def value(self, ob):
|
||||
raise NotImplementedError
|
||||
@@ -21,7 +21,8 @@ MODEL_CONFIGS = [
|
||||
"extra_frameskip", # (int) for number of frames to skip
|
||||
"fcnet_activation", # Nonlinearity for fully connected net (tanh, relu)
|
||||
"fcnet_hiddens", # Number of hidden layers for fully connected net
|
||||
"free_log_std" # Documented in ray.rllib.models.Model
|
||||
"free_log_std", # Documented in ray.rllib.models.Model
|
||||
"channel_major", # Pytorch conv requires images to be channel-major
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -30,11 +30,16 @@ class AtariPixelPreprocessor(Preprocessor):
|
||||
self._grayscale = self._options.get("grayscale", False)
|
||||
self._zero_mean = self._options.get("zero_mean", True)
|
||||
self._dim = self._options.get("dim", 80)
|
||||
self._pytorch = self._options.get("pytorch", False)
|
||||
if self._grayscale:
|
||||
self.shape = (self._dim, self._dim, 1)
|
||||
else:
|
||||
self.shape = (self._dim, self._dim, 3)
|
||||
|
||||
# pytorch requires (# in-channels, row dim, col dim)
|
||||
if self._pytorch:
|
||||
self.shape = self.shape[::-1]
|
||||
|
||||
# TODO(ekl) why does this need to return an extra size-1 dim (the [None])
|
||||
def transform(self, observation):
|
||||
"""Downsamples images from (210, 160, 3) by the configured factor."""
|
||||
@@ -54,6 +59,8 @@ class AtariPixelPreprocessor(Preprocessor):
|
||||
scaled = (scaled - 128) / 128
|
||||
else:
|
||||
scaled *= 1.0 / 255.0
|
||||
if self._pytorch:
|
||||
scaled = np.reshape(scaled, self.shape)
|
||||
return scaled
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user