From afdc87323f40918d758ba0a02fc650808f7e5796 Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Sun, 12 Nov 2017 00:20:33 -0800
Subject: [PATCH] [rllib] PyTorch Models for A3C (#1187)

* fixing policy

* Compute Action is singular, fixed weird issue with arrays

* remove vestige

* extraneous ipdb

* Can Drop in Pytorch Model

* lint

* introducing models

* fix base policy

* Missed this from last time

* lint

* removedolds

* getting vision working

* LINT

* trying to fix test dependencies

* requiremnets

* try

* tryconda

* yes

* shutup

* flake_passes

* changes

* removing weight initializer for lstm for now

* unused

* adam

* clip

* zero

* properscaling

* weight

* try

* fix up pytorch visionnet

* bias correction

* fix model

* same visionnet

* matching_bad_things

* test

* try locking

* fixing_linear

* naming

* lint

* FORJENKINS

* clouds

* lint

* Lint + removed dependencies

* removed dependencies

* format
---
 docker/examples/Dockerfile                   |  1 +
 python/ray/rllib/a3c/a3c.py                  |  6 +-
 python/ray/rllib/a3c/policy.py               |  3 -
 python/ray/rllib/a3c/shared_model.py         |  2 +-
 python/ray/rllib/a3c/shared_model_lstm.py    |  2 +-
 python/ray/rllib/a3c/shared_torch_policy.py  | 73 ++++++++++++++++++
 python/ray/rllib/a3c/tfpolicy.py             |  7 +-
 python/ray/rllib/a3c/torchpolicy.py          | 78 ++++++++++++++++++++
 python/ray/rllib/models/catalog.py           | 24 ++++++
 python/ray/rllib/models/preprocessors.py     | 10 +--
 python/ray/rllib/models/pytorch/__init__.py  |  0
 python/ray/rllib/models/pytorch/fcnet.py     | 56 ++++++++++++++
 python/ray/rllib/models/pytorch/misc.py      | 69 +++++++++++++++++
 python/ray/rllib/models/pytorch/model.py     | 70 ++++++++++++++++++
 python/ray/rllib/models/pytorch/visionnet.py | 70 ++++++++++++++++++
 test/jenkins_tests/run_multi_node_tests.sh   |  7 ++
 16 files changed, 462 insertions(+), 16 deletions(-)
 create mode 100644 python/ray/rllib/a3c/shared_torch_policy.py
 create mode 100644 python/ray/rllib/models/pytorch/__init__.py
 create mode 100644 python/ray/rllib/models/pytorch/fcnet.py
 create mode 100644 python/ray/rllib/models/pytorch/misc.py
 create mode 100644 python/ray/rllib/models/pytorch/model.py
 create mode 100644 python/ray/rllib/models/pytorch/visionnet.py

diff --git a/docker/examples/Dockerfile b/docker/examples/Dockerfile
index c74ed5ebd..5f17d31c5 100644
--- a/docker/examples/Dockerfile
+++ b/docker/examples/Dockerfile
@@ -4,3 +4,4 @@ FROM ray-project/deploy
 RUN conda install -y -c conda-forge tensorflow
 RUN apt-get install -y zlib1g-dev
 RUN pip install gym[atari] opencv-python==3.2.0.8 smart_open
+RUN conda install -y -q pytorch torchvision -c soumith
diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py
index 6bd6c5469..41e363769 100644
--- a/python/ray/rllib/a3c/a3c.py
+++ b/python/ray/rllib/a3c/a3c.py
@@ -20,10 +20,11 @@ DEFAULT_CONFIG = {
     "num_batches_per_iteration": 100,
     "batch_size": 10,
     "use_lstm": True,
+    "use_pytorch": False,
     "model": {"grayscale": True,
               "zero_mean": False,
               "dim": 42,
-              "channel_major": True}
+              "channel_major": False}
 }
 
 
@@ -35,6 +36,9 @@ class A3CAgent(Agent):
         self.env = create_and_wrap(self.env_creator, self.config["model"])
         if self.config["use_lstm"]:
             policy_cls = SharedModelLSTM
+        elif self.config["use_pytorch"]:
+            from ray.rllib.a3c.shared_torch_policy import SharedTorchPolicy
+            policy_cls = SharedTorchPolicy
         else:
             policy_cls = SharedModel
         self.policy = policy_cls(
diff --git a/python/ray/rllib/a3c/policy.py b/python/ray/rllib/a3c/policy.py
index 2b01aaeb1..ee18912bb 100644
--- a/python/ray/rllib/a3c/policy.py
+++ b/python/ray/rllib/a3c/policy.py
@@ -20,9 +20,6 @@ class Policy(object):
     def compute_gradients(self, batch):
         raise NotImplementedError
 
-    def get_vf_loss(self):
-        raise NotImplementedError
-
     def compute_action(self, observations):
         """Compute action for a _single_ observation"""
         raise NotImplementedError
diff --git a/python/ray/rllib/a3c/shared_model.py b/python/ray/rllib/a3c/shared_model.py
index bdf3900c5..ac5fcae44 100644
--- a/python/ray/rllib/a3c/shared_model.py
+++ b/python/ray/rllib/a3c/shared_model.py
@@ -12,7 +12,7 @@ class SharedModel(TFPolicy):
     def __init__(self, ob_space, ac_space, **kwargs):
         super(SharedModel, self).__init__(ob_space, ac_space, **kwargs)
 
-    def setup_graph(self, ob_space, ac_space):
+    def _setup_graph(self, ob_space, ac_space):
         self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
         dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
         self._model = ModelCatalog.get_model(self.x, self.logit_dim)
diff --git a/python/ray/rllib/a3c/shared_model_lstm.py b/python/ray/rllib/a3c/shared_model_lstm.py
index 17b304749..f6b5b2619 100644
--- a/python/ray/rllib/a3c/shared_model_lstm.py
+++ b/python/ray/rllib/a3c/shared_model_lstm.py
@@ -14,7 +14,7 @@ class SharedModelLSTM(TFPolicy):
     def __init__(self, ob_space, ac_space, **kwargs):
         super(SharedModelLSTM, self).__init__(ob_space, ac_space, **kwargs)
 
-    def setup_graph(self, ob_space, ac_space):
+    def _setup_graph(self, ob_space, ac_space):
         self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
         dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
         self._model = LSTM(self.x, self.logit_dim, {})
diff --git a/python/ray/rllib/a3c/shared_torch_policy.py b/python/ray/rllib/a3c/shared_torch_policy.py
new file mode 100644
index 000000000..b29e5541b
--- /dev/null
+++ b/python/ray/rllib/a3c/shared_torch_policy.py
@@ -0,0 +1,73 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+
+from ray.rllib.a3c.torchpolicy import TorchPolicy
+from ray.rllib.models.pytorch.misc import var_to_np, convert_batch
+from ray.rllib.models.catalog import ModelCatalog
+
+
+class SharedTorchPolicy(TorchPolicy):
+    """Assumes nonrecurrent."""
+
+    def __init__(self, ob_space, ac_space, **kwargs):
+        super(SharedTorchPolicy, self).__init__(
+            ob_space, ac_space, **kwargs)
+
+    def _setup_graph(self, ob_space, ac_space):
+        _, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
+        self._model = ModelCatalog.get_torch_model(ob_space, self.logit_dim)
+        self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.0001)
+
+    def compute_action(self, ob, *args):
+        """Should take in a SINGLE ob"""
+        with self.lock:
+            ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
+            logits, values = self._model(ob)
+            samples = self._model.probs(logits).multinomial().squeeze()
+            values = values.squeeze(0)
+            return var_to_np(samples), var_to_np(values)
+
+    def compute_logits(self, ob, *args):
+        with self.lock:
+            ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
+            res = self._model.hidden_layers(ob)
+            return var_to_np(self._model.logits(res))
+
+    def value(self, ob, *args):
+        with self.lock:
+            ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
+            res = self._model.hidden_layers(ob)
+            res = self._model.value_branch(res)
+            res = res.squeeze(0)
+            return var_to_np(res)
+
+    def _evaluate(self, obs, actions):
+        """Passes in multiple obs."""
+        logits, values = self._model(obs)
+        log_probs = F.log_softmax(logits)
+        probs = self._model.probs(logits)
+        action_log_probs = log_probs.gather(1, actions.view(-1, 1))
+        entropy = -(log_probs * probs).sum(-1).sum()
+        return values, action_log_probs, entropy
+
+    def _backward(self, batch):
+        """Loss is encoded in here. Defining a new loss function
+        would start by rewriting this function"""
+
+        states, acs, advs, rs, _ = convert_batch(batch)
+        values, ac_logprobs, entropy = self._evaluate(states, acs)
+        pi_err = -(advs * ac_logprobs).sum()
+        value_err = 0.5 * (values - rs).pow(2).sum()
+
+        self.optimizer.zero_grad()
+        overall_err = 0.5 * value_err + pi_err - entropy * 0.01
+        overall_err.backward()
+        torch.nn.utils.clip_grad_norm(self._model.parameters(), 40)
+
+    def get_initial_features(self):
+        return [None]
diff --git a/python/ray/rllib/a3c/tfpolicy.py b/python/ray/rllib/a3c/tfpolicy.py
index f73974b83..17d4831fa 100644
--- a/python/ray/rllib/a3c/tfpolicy.py
+++ b/python/ray/rllib/a3c/tfpolicy.py
@@ -17,7 +17,7 @@ class TFPolicy(Policy):
         self.g = tf.Graph()
         with self.g.as_default(), tf.device(worker_device):
             with tf.variable_scope(name):
-                self.setup_graph(ob_space, action_space)
+                self._setup_graph(ob_space, action_space)
                 assert all([hasattr(self, attr)
                             for attr in ["vf", "logits", "x", "var_list"]])
             print("Setting up loss")
@@ -25,7 +25,7 @@ class TFPolicy(Policy):
             self.setup_gradients()
             self.initialize()
 
-    def setup_graph(self):
+    def _setup_graph(self, ob_space, action_space):  # subclass hook
         raise NotImplementedError
 
     def setup_loss(self, action_space):
@@ -92,9 +92,6 @@ class TFPolicy(Policy):
     def compute_gradients(self, batch):
         raise NotImplementedError
 
-    def get_vf_loss(self):
-        raise NotImplementedError
-
     def compute_action(self, observations):
         raise NotImplementedError
 
diff --git a/python/ray/rllib/a3c/torchpolicy.py b/python/ray/rllib/a3c/torchpolicy.py
index e69de29bb..19ca38f30 100644
--- a/python/ray/rllib/a3c/torchpolicy.py
+++ b/python/ray/rllib/a3c/torchpolicy.py
@@ -0,0 +1,78 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+from torch.autograd import Variable
+
+from ray.rllib.a3c.policy import Policy
+from threading import Lock
+
+
+class TorchPolicy(Policy):
+    """The policy base class for Torch.
+
+    The model is a separate object than the policy. This could be changed
+    in the future."""
+
+    def __init__(self, ob_space, action_space, name="local", summarize=True):
+        self.local_steps = 0
+        self.summarize = summarize
+        self._setup_graph(ob_space, action_space)
+        torch.set_num_threads(2)
+        self.lock = Lock()
+
+    def apply_gradients(self, grads):
+        self.optimizer.zero_grad()
+        for g, p in zip(grads, self._model.parameters()):
+            p.grad = Variable(torch.from_numpy(g))
+        self.optimizer.step()
+
+    def get_weights(self):
+        # !! This only returns references to the data.
+        return self._model.state_dict()
+
+    def set_weights(self, weights):
+        with self.lock:
+            self._model.load_state_dict(weights)
+
+    def compute_gradients(self, batch):
+        """Runs _backward to populate the gradient of each model parameter,
+        then returns those gradients without applying them.
+
+        Args:
+            batch: Batch of data needed for gradient calculation.
+
+        Return:
+            gradients (list of np arrays): List of gradients
+            info (dict): Extra information (currently always empty)"""
+        with self.lock:
+            self._backward(batch)
+            # Note that return values are just references;
+            # calling zero_grad will modify the values
+            return [p.grad.data.numpy() for p in self._model.parameters()], {}
+
+    def model_update(self, batch):
+        """Implements compute + apply
+
+        TODO(rliaw): Pytorch has nice caching property that doesn't require
+        full batch to be passed in. Can exploit that later"""
+        with self.lock:
+            self._backward(batch)
+            self.optimizer.step()
+
+    def _setup_graph(self, ob_space, action_space):  # subclass hook
+        raise NotImplementedError
+
+    def _backward(self, batch):
+        """Implements the loss function and calculates the gradient.
+        Pytorch automatically generates a backward trace for each variable.
+        Assumption right now is that variables are moved, so the backward
+        trace is lost.
+
+        This function regenerates the backward trace and
+        calculates the gradient."""
+        raise NotImplementedError
+
+    def get_initial_features(self):
+        return []
diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py
index a23db88dd..b71f3623e 100644
--- a/python/ray/rllib/models/catalog.py
+++ b/python/ray/rllib/models/catalog.py
@@ -85,6 +85,30 @@ class ModelCatalog(object):
 
         return FullyConnectedNetwork(inputs, num_outputs, options)
 
+    @staticmethod
+    def get_torch_model(input_shape, num_outputs, options=dict()):
+        """Returns a PyTorch suitable model.
+
+        Args:
+            input_shape (tup): The input shape to the model.
+            num_outputs (int): The size of the output vector of the model.
+            options (dict): Optional args to pass to the model constructor.
+
+        Returns:
+            model (Model): Neural network model.
+        """
+        from ray.rllib.models.pytorch.fcnet import (
+            FullyConnectedNetwork as PyTorchFCNet)
+        from ray.rllib.models.pytorch.visionnet import (
+            VisionNetwork as PyTorchVisionNet)
+
+        obs_rank = len(input_shape) - 1
+
+        if obs_rank > 1:
+            return PyTorchVisionNet(input_shape, num_outputs, options)
+
+        return PyTorchFCNet(input_shape[0], num_outputs, options)
+
     @classmethod
     def get_preprocessor(cls, env, options=dict()):
         """Returns a suitable processor for the given environment.
diff --git a/python/ray/rllib/models/preprocessors.py b/python/ray/rllib/models/preprocessors.py
index 93ef6a0b4..d03e03e7c 100644
--- a/python/ray/rllib/models/preprocessors.py
+++ b/python/ray/rllib/models/preprocessors.py
@@ -30,15 +30,15 @@ class AtariPixelPreprocessor(Preprocessor):
         self._grayscale = self._options.get("grayscale", False)
         self._zero_mean = self._options.get("zero_mean", True)
         self._dim = self._options.get("dim", 80)
-        self._pytorch = self._options.get("pytorch", False)
+        self._channel_major = self._options.get("channel_major", False)
         if self._grayscale:
             self.shape = (self._dim, self._dim, 1)
         else:
             self.shape = (self._dim, self._dim, 3)
 
-        # pytorch requires (# in-channels, row dim, col dim)
-        if self._pytorch:
-            self.shape = self.shape[::-1]
+        # channel_major requires (# in-channels, row dim, col dim)
+        if self._channel_major:
+            self.shape = self.shape[-1:] + self.shape[:-1]
 
     # TODO(ekl) why does this need to return an extra size-1 dim (the [None])
     def transform(self, observation):
@@ -59,7 +59,7 @@ class AtariPixelPreprocessor(Preprocessor):
             scaled = (scaled - 128) / 128
         else:
             scaled *= 1.0 / 255.0
-        if self._pytorch:
+        if self._channel_major:
             scaled = np.reshape(scaled, self.shape)
         return scaled
 
diff --git a/python/ray/rllib/models/pytorch/__init__.py b/python/ray/rllib/models/pytorch/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/ray/rllib/models/pytorch/fcnet.py b/python/ray/rllib/models/pytorch/fcnet.py
new file mode 100644
index 000000000..b67f1365b
--- /dev/null
+++ b/python/ray/rllib/models/pytorch/fcnet.py
@@ -0,0 +1,56 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from ray.rllib.models.pytorch.model import Model, SlimFC
+from ray.rllib.models.pytorch.misc import normc_initializer
+import torch.nn as nn
+
+
+class FullyConnectedNetwork(Model):
+    """TODO(rliaw): Logits, Value should both be contained here"""
+    def _init(self, inputs, num_outputs, options):
+        assert type(inputs) is int
+        hiddens = options.get("fcnet_hiddens", [256, 256])
+        fcnet_activation = options.get("fcnet_activation", "tanh")
+        activation = None
+        if fcnet_activation == "tanh":
+            activation = nn.Tanh
+        elif fcnet_activation == "relu":
+            activation = nn.ReLU
+        print("Constructing fcnet {} {}".format(hiddens, activation))
+
+        layers = []
+        last_layer_size = inputs
+        for size in hiddens:
+            layers.append(SlimFC(
+                last_layer_size, size,
+                initializer=normc_initializer(1.0),
+                activation_fn=activation))
+            last_layer_size = size
+
+        self.hidden_layers = nn.Sequential(*layers)
+
+        self.logits = SlimFC(
+            last_layer_size, num_outputs,
+            initializer=normc_initializer(0.01),
+            activation_fn=None)
+        self.probs = nn.Softmax()
+        self.value_branch = SlimFC(
+            last_layer_size, 1,
+            initializer=normc_initializer(1.0),
+            activation_fn=None)
+
+    def forward(self, obs):
+        """ Internal method - pass in Variables, not numpy arrays
+
+        Args:
+            obs: observations and features
+
+        Return:
+            logits: logits to be sampled from for each state
+            value: value function for each state"""
+        res = self.hidden_layers(obs)
+        logits = self.logits(res)
+        value = self.value_branch(res)
+        return logits, value
diff --git a/python/ray/rllib/models/pytorch/misc.py b/python/ray/rllib/models/pytorch/misc.py
new file mode 100644
index 000000000..0c4b16f00
--- /dev/null
+++ b/python/ray/rllib/models/pytorch/misc.py
@@ -0,0 +1,69 @@
+""" Code adapted from https://github.com/ikostrikov/pytorch-a3c"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import torch
+from torch.autograd import Variable
+
+
+def convert_batch(batch, has_features=False):
+    """Convert batch from numpy to PT variable"""
+    states = Variable(torch.from_numpy(batch.si).float())
+    acs = Variable(torch.from_numpy(batch.a))
+    advs = Variable(torch.from_numpy(batch.adv.copy()).float())
+    advs = advs.view(-1, 1)
+    rs = Variable(torch.from_numpy(batch.r.copy()).float())
+    rs = rs.view(-1, 1)
+    if has_features:
+        features = [Variable(torch.from_numpy(f))
+                    for f in batch.features]
+    else:
+        features = batch.features
+    return states, acs, advs, rs, features
+
+
+def var_to_np(var):
+    return var.data.numpy()[0]
+
+
+def normc_initializer(std=1.0):
+    def initializer(tensor):
+        tensor.data.normal_(0, 1)
+        tensor.data *= std / torch.sqrt(
+            tensor.data.pow(2).sum(1, keepdim=True))
+    return initializer
+
+
+def valid_padding(in_size, filter_size, stride_size):
+    """Compute zero padding that matches TF conv2d `same` padding. See
+    www.tensorflow.org/versions/r0.12/api_docs/python/nn/convolution
+
+    Params:
+        in_size (tuple): Rows (Height), Column (Width) for input
+        filter_size (tuple): Rows (Height), Column (Width) for filter
+        stride_size (tuple): Rows (Height), Column (Width) for stride
+
+    Output:
+        padding (tuple): (left, right, top, bottom) for torch.nn.ZeroPad2d
+        output (tuple): Integer (rows, cols) after padding and convolution
+    """
+    in_height, in_width = in_size
+    filter_height, filter_width = filter_size
+    stride_height, stride_width = stride_size
+
+    out_height = int(np.ceil(float(in_height) / float(stride_height)))
+    out_width = int(np.ceil(float(in_width) / float(stride_width)))
+
+    pad_along_height = max(0, int(  # clamp: TF never pads negatively
+        (out_height - 1) * stride_height + filter_height - in_height))
+    pad_along_width = max(0, int(
+        (out_width - 1) * stride_width + filter_width - in_width))
+    pad_top = pad_along_height // 2
+    pad_bottom = pad_along_height - pad_top
+    pad_left = pad_along_width // 2
+    pad_right = pad_along_width - pad_left
+    padding = (pad_left, pad_right, pad_top, pad_bottom)
+    output = (out_height, out_width)
+    return padding, output
diff --git a/python/ray/rllib/models/pytorch/model.py b/python/ray/rllib/models/pytorch/model.py
new file mode 100644
index 000000000..fd1577f33
--- /dev/null
+++ b/python/ray/rllib/models/pytorch/model.py
@@ -0,0 +1,70 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    def __init__(self, obs_space, ac_space, options):
+        super(Model, self).__init__()
+        self._init(obs_space, ac_space, options)
+
+    def _init(self, inputs, num_outputs, options):
+        raise NotImplementedError
+
+    def forward(self, obs):
+        """Forward pass for the model. Internal method - should only
+        be passed PyTorch Tensors.
+
+        PyTorch automatically overloads the given model
+        with this function. Recommended that model(obs)
+        is used instead of model.forward(obs). See
+        https://discuss.pytorch.org/t/any-different-between-model
+        -input-and-model-forward-input/3690
+        """
+        raise NotImplementedError
+
+
+class SlimConv2d(nn.Module):
+    """Simple mock of tf.slim Conv2d"""
+
+    def __init__(self, in_channels, out_channels, kernel, stride, padding,
+                 initializer=nn.init.xavier_uniform,
+                 activation_fn=nn.ReLU, bias_init=0):
+        super(SlimConv2d, self).__init__()
+        layers = []
+        if padding:
+            layers.append(nn.ZeroPad2d(padding))
+        conv = nn.Conv2d(in_channels, out_channels, kernel, stride)
+        if initializer:
+            initializer(conv.weight)
+        nn.init.constant(conv.bias, bias_init)
+
+        layers.append(conv)
+        if activation_fn:
+            layers.append(activation_fn())
+        self._model = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self._model(x)
+
+
+class SlimFC(nn.Module):
+    """Simple PyTorch of `linear` function"""
+
+    def __init__(self, in_size, size, initializer=None,
+                 activation_fn=None, bias_init=0):
+        super(SlimFC, self).__init__()
+        layers = []
+        linear = nn.Linear(in_size, size)
+        if initializer:
+            initializer(linear.weight)
+        nn.init.constant(linear.bias, bias_init)
+        layers.append(linear)
+        if activation_fn:
+            layers.append(activation_fn())
+        self._model = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self._model(x)
diff --git a/python/ray/rllib/models/pytorch/visionnet.py b/python/ray/rllib/models/pytorch/visionnet.py
new file mode 100644
index 000000000..99786a8d4
--- /dev/null
+++ b/python/ray/rllib/models/pytorch/visionnet.py
@@ -0,0 +1,70 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.nn as nn
+
+from ray.rllib.models.pytorch.model import Model, SlimConv2d, SlimFC
+from ray.rllib.models.pytorch.misc import normc_initializer, valid_padding
+
+
+class VisionNetwork(Model):
+    """Generic vision network"""
+
+    def _init(self, inputs, num_outputs, options):
+        """TF visionnet in PyTorch.
+
+        Params:
+            inputs (tuple): (channels, rows/height, cols/width)
+            num_outputs (int): logits size
+        """
+        filters = options.get("conv_filters", [
+            [16, [8, 8], 4],
+            [32, [4, 4], 2],
+            [512, [10, 10], 1]
+        ])
+        layers = []
+        in_channels, in_size = inputs[0], inputs[1:]
+
+        for out_channels, kernel, stride in filters[:-1]:
+            padding, out_size = valid_padding(
+                in_size, kernel, [stride, stride])
+            layers.append(SlimConv2d(
+                in_channels, out_channels, kernel, stride, padding))
+            in_channels = out_channels
+            in_size = out_size
+
+        out_channels, kernel, stride = filters[-1]
+        layers.append(SlimConv2d(
+                in_channels, out_channels, kernel, stride, None))
+        self._convs = nn.Sequential(*layers)
+
+        self.logits = SlimFC(
+            out_channels, num_outputs, initializer=nn.init.xavier_uniform)
+        self.probs = nn.Softmax()
+        self.value_branch = SlimFC(
+            out_channels, 1, initializer=normc_initializer())
+
+    def hidden_layers(self, obs):
+        """ Internal method - pass in Variables, not numpy arrays
+
+        args:
+            obs: observations and features"""
+        res = self._convs(obs)
+        res = res.squeeze(3)
+        res = res.squeeze(2)
+        return res
+
+    def forward(self, obs):
+        """Internal method. Implements the forward pass of the network.
+
+        Args:
+            obs (PyTorch): observations and features
+
+        Return:
+            logits (PyTorch): logits to be sampled from for each state
+            value (PyTorch): value function for each state"""
+        res = self.hidden_layers(obs)
+        logits = self.logits(res)
+        value = self.value_branch(res)
+        return logits, value
diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh
index 97d3e5d30..73971c933 100755
--- a/test/jenkins_tests/run_multi_node_tests.sh
+++ b/test/jenkins_tests/run_multi_node_tests.sh
@@ -125,6 +125,13 @@ docker run --shm-size=10G --memory=10G $DOCKER_SHA \
     --stop '{"training_iteration": 2}' \
     --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1, "model": {"dim": 40, "conv_filters": [[16, [8, 8], 4], [32, [4, 4], 2], [512, [5, 5], 1]]}, "extra_frameskip": 4}'
 
+docker run --shm-size=10G --memory=10G $DOCKER_SHA \
+    python /ray/python/ray/rllib/train.py \
+    --env PongDeterministic-v4 \
+    --alg A3C \
+    --stop '{"training_iteration": 2}' \
+    --config '{"num_workers": 2, "use_lstm": false, "use_pytorch": true, "model": {"grayscale": true, "zero_mean": false, "dim": 80, "channel_major": true}}'
+
 docker run --shm-size=10G --memory=10G $DOCKER_SHA \
     python /ray/python/ray/rllib/test/test_checkpoint_restore.py