From 2b7e97e30720f7a81f989f58749a18309adb0eb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9gory=20Ch=C3=A2tel?= <chatel.gregory@gmail.com>
Date: Wed, 4 Jul 2018 13:49:50 +0200
Subject: [PATCH 1/2] Creating the model regardless of the number of GPUs.

---
 train.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/train.py b/train.py
index ff5acfb..dd76d01 100644
--- a/train.py
+++ b/train.py
@@ -263,8 +263,7 @@ if __name__ == '__main__':
     n_batch_train = args.n_batch * max(n_gpu, 1)
     n_updates_total = (n_train // n_batch_train) * args.n_iter
 
-    if n_gpu > 1:
-        dh_model = DoubleHeadModel(args, clf_token, vocab, n_ctx)
+    dh_model = DoubleHeadModel(args, clf_token, vocab, n_ctx)
 
     criterion = nn.CrossEntropyLoss(reduce=False)
     model_opt = OpenAIAdam(dh_model.parameters(),

From be407cdd37099324a0aa9fb7619cc1db329fd09d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9gory=20Ch=C3=A2tel?= <chatel.gregory@gmail.com>
Date: Wed, 4 Jul 2018 13:50:19 +0200
Subject: [PATCH 2/2] Solving the missing embed_shape variable issue in LMHead.

---
 model_pytorch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model_pytorch.py b/model_pytorch.py
index 279dcf8..d6b7dba 100644
--- a/model_pytorch.py
+++ b/model_pytorch.py
@@ -177,6 +177,7 @@ class LMHead(nn.Module):
     def __init__(self, model, cfg):
         super(LMHead, self).__init__()
         self.n_embd = cfg.n_embd
+        embed_shape = model.embed.weight.shape
         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.decoder.weight = model.embed.weight # Tied weights