diff --git a/neural_processes/data/smart_meter.py b/neural_processes/data/smart_meter.py
index 54ee2fe..2783a24 100644
--- a/neural_processes/data/smart_meter.py
+++ b/neural_processes/data/smart_meter.py
@@ -193,8 +193,8 @@ def get_smartmeter_df(indir=Path('./data/smart-meters-in-london'), max_files=60,
     test_files = [f for f in csv_files if is_test(f)]
     val_files = [f for f in csv_files if is_val(f) and (not is_test(f))]
     train_files = [f for f in csv_files if (not is_val(f)) and (not is_test(f))]
-    print(len(train_files), len(val_files), len(test_files))
-    print(train_files, val_files, test_files)
+    # print(len(train_files), len(val_files), len(test_files))
+    # print(train_files, val_files, test_files)
     assert not set(train_files).intersection(set(test_files), set(val_files))
     assert not set(test_files).intersection(set(val_files))
 
diff --git a/neural_processes/lightning.py b/neural_processes/lightning.py
index ee4a940..76207dd 100644
--- a/neural_processes/lightning.py
+++ b/neural_processes/lightning.py
@@ -57,7 +57,7 @@ class PL_Seq2Seq(pl.LightningModule):
         train_outputs = agg_logs(self.train_logs)
         self.train_logs = []
 
-        print(f"step val {self.trainer.global_step}, {outputs} {train_outputs}")
+        logger.info(f"step val {self.trainer.global_step}, {outputs} {train_outputs}")
 
         # tensorboard_logs_str = {k: f"{v}" for k, v in tensorboard_logs.items()}
         # print(f"step {self.trainer.global_step}, {outputs}")
diff --git a/neural_processes/models/lstm_seqseq.py b/neural_processes/models/lstm_seqseq.py
index 48b7005..5d396ec 100644
--- a/neural_processes/models/lstm_seqseq.py
+++ b/neural_processes/models/lstm_seqseq.py
@@ -38,7 +38,6 @@ class Seq2SeqNet(nn.Module):
     def __init__(self, hparams, _min_std=0.05):
         super().__init__()
         hparams = hparams_power(hparams)
-        print(hparams)
         self.hparams = hparams
         self._min_std = _min_std
 
@@ -107,7 +106,7 @@ class Seq2SeqNet(nn.Module):
 
         if output is not None:
             num_layers = h_out.shape[1]
-            print(cell.max(), h_out.max(), h.max())
+            # print(cell.max(), h_out.max(), h.max())
             h_out += h.mean(1).repeat(1, num_layers, 1)
             cell += h.max(1).repeat(1, num_layers, 1)
 
diff --git a/neural_processes/models/neural_process/lightning.py b/neural_processes/models/neural_process/lightning.py
index 327a1ed..b7e6f33 100644
--- a/neural_processes/models/neural_process/lightning.py
+++ b/neural_processes/models/neural_process/lightning.py
@@ -17,15 +17,15 @@ class PL_NeuralProcess(PL_Seq2Seq):
 
     DEFAULT_ARGS = {
         'dropout': 0.1,
-        'learning_rate': 0.004,
+        'learning_rate': 0.003,
         'attention_dropout': 0.5,
         'batchnorm': False,
         'attention_layers': 2,
         'det_enc_cross_attn_type': 'uniform',
         'det_enc_self_attn_type': 'uniform',
         'latent_enc_self_attn_type': 'uniform',
-        'num_heads_power': 3,
-        'hidden_dim_power': 3,
+        'num_heads_power': 2,
+        'hidden_dim_power': 6,
         'latent_dim_power': 5,
         'n_latent_encoder_layers': 3,
         'n_det_encoder_layers': 3,
diff --git a/neural_processes/models/neural_process/model.py b/neural_processes/models/neural_process/model.py
index d5605af..ae83378 100644
--- a/neural_processes/models/neural_process/model.py
+++ b/neural_processes/models/neural_process/model.py
@@ -309,11 +309,11 @@ class NeuralProcess(nn.Module):
         
         device = next(self.parameters()).device
         
-        if self.hparams.get('bnorm_inputs', True):
-            # https://stackoverflow.com/a/46772183/221742
-            target_x = self.norm_x(target_x)
-            context_x = self.norm_x(context_x)
-            context_y = self.norm_y(context_y)
+        # if self.hparams.get('bnorm_inputs', True):
+        # https://stackoverflow.com/a/46772183/221742
+        target_x = self.norm_x(target_x)
+        context_x = self.norm_x(context_x)
+        context_y = self.norm_y(context_y)
 
         if self._use_rnn:
             # see https://arxiv.org/abs/1910.09323 where x is substituted with h = RNN(x)
diff --git a/neural_processes/modules/attention.py b/neural_processes/modules/attention.py
index 7f60446..f62cfa7 100644
--- a/neural_processes/modules/attention.py
+++ b/neural_processes/modules/attention.py
@@ -63,6 +63,8 @@ class Attention(nn.Module):
                 dropout=dropout,
                 batchnorm=batchnorm,
             )
+        elif self._rep == "lstm":
+            self._lstm = LSTMBlock(x_dim, hidden_dim, batchnorm=batchnorm, dropout=dropout, num_layers=attention_layers)
 
         if attention_type == "uniform":
             self._attention_func = self._uniform_attention
@@ -95,6 +97,9 @@ class Attention(nn.Module):
         if self._rep == "mlp":
             k = self.batch_mlp_k(k)
             q = self.batch_mlp_q(q)
+        elif self._rep == "lstm":  # module is stored as self._lstm where the "lstm" rep is built
+            k = self._lstm(k)  # the same LSTMBlock encodes keys ...
+            q = self._lstm(q)  # ... and queries
         rep = self._attention_func(k, v, q)
         return rep
 
diff --git a/neural_processes/train.py b/neural_processes/train.py
index d53716e..cf748c9 100644
--- a/neural_processes/train.py
+++ b/neural_processes/train.py
@@ -42,6 +42,7 @@ def main(
         val_percent_check=PERCENT_TEST_EXAMPLES,
         checkpoint_callback=checkpoint_callback,
         max_epochs=hparams["max_nb_epochs"],
+        weights_summary='top',
         gpus=-1 if torch.cuda.is_available() else None,
         early_stop_callback=PyTorchLightningPruningCallback(trial, monitor="val_loss")
         if prune
@@ -62,7 +63,7 @@ def objective(trial, PL_MODEL_CLS, name, user_attrs):
     trial = PL_MODEL_CLS.add_suggest(trial)
     [trial.set_user_attr(k, v) for k, v in user_attrs.items()]
 
-    print(dict(number=trial.number, params=trial.params, user_attrs=trial.user_attrs))
+    logger.debug(dict(number=trial.number, params=trial.params, user_attrs=trial.user_attrs))
 
     model, trainer = main(trial, PL_MODEL_CLS=PL_MODEL_CLS, name=name)
 
@@ -71,24 +72,24 @@ def objective(trial, PL_MODEL_CLS, name, user_attrs):
     if len(checkpoints):
         checkpoint = checkpoints[-1]
         device = next(model.parameters()).device
-        print(f"Loading checkpoint {checkpoint}")
+        logger.info(f"Loading checkpoint {checkpoint}")
         model = model.load_from_checkpoint(checkpoint).to(device)
     
     trainer.test(model)
 
     # also report to tensorboard & print
-    print("logger.metrics", model.logger.metrics[-1:])
+    logger.info("logger.metrics %s", model.logger.metrics[-1:])  # needs a placeholder: extra args are %-format args, not print fields
     model.logger.experiment.add_hparams(trial.params, model.logger.metrics[-1])
     model.logger.save()
 
-    return model.logger.metrics[-1]["agg_test_loss"]
+    return model.logger.metrics[-1]["agg_test_score"]
 
 
 def add_number(trial: optuna.Trial, model_dir: Path):
     # For manual experiment we will start at -1 and deincr by 1
     versions = [int(s.stem.split("_")[-1]) for s in model_dir.glob("version_*")] + [-1]
     trial.number = min(versions) - 1
-    print("trial.number", trial.number)
+    # logger.debug("trial.number %s", trial.number)  # left disabled; placeholder added so it works if re-enabled
     return trial
 
 
@@ -101,7 +102,7 @@ def run_trial(
     plot_from_loader=plot_from_loader,
     number=None,
 ):
-    print(f"now run `tensorboard --logdir {MODEL_DIR}`")
+    logger.info(f"now run `tensorboard --logdir {MODEL_DIR}`")
     (MODEL_DIR / name).mkdir(parents=True, exist_ok=True)
 
     if getattr(PL_MODEL_CLS, 'DEFAULT_ARGS', None):
@@ -121,7 +122,7 @@ def run_trial(
 
     # Add user attributes
     [trial.set_user_attr(k, v) for k, v in user_attrs.items()]
-    print('trial', trial.number, trial, trial.params, trial.user_attrs)
+    logger.info('trial number=%s trial=%s params=%s attrs=%s', trial.number, trial, trial.params, trial.user_attrs)
 
     model, trainer = main(
         trial, PL_MODEL_CLS, name=name, MODEL_DIR=MODEL_DIR, train=False, prune=False
@@ -132,7 +133,7 @@ def run_trial(
         try:
             trainer.fit(model)
         except KeyboardInterrupt:
-            print('KeyboardInterrupt, skipping rest of training')
+            logger.warning('KeyboardInterrupt, skipping rest of training')
             pass
 
         # Plot
@@ -151,7 +152,7 @@ def run_trial(
     if len(checkpoints):
         checkpoint = checkpoints[-1]
         device = next(model.parameters()).device
-        print(f"Loading checkpoint {checkpoint}")
+        logger.info(f"Loading checkpoint {checkpoint}")
         model = model.load_from_checkpoint(checkpoint).to(device)
 
         # Plot
@@ -162,11 +163,11 @@ def run_trial(
         plot_from_loader(model.test_dataloader(), model, i=670, title='test 670')
         plt.show()
     else:
-        print('no checkpoints')
+        logger.warning('no checkpoints')
 
     try:
         trainer.test(model)
     except KeyboardInterrupt:
-        print('KeyboardInterrupt, skipping rest of testing')
+        logger.warning('KeyboardInterrupt, skipping rest of testing')
         pass
     return trial, trainer, model