diff --git a/pts/model/causal_deepar/causal_deepar_estimator.py b/pts/model/causal_deepar/causal_deepar_estimator.py
index 55cc04a..91d02ae 100644
--- a/pts/model/causal_deepar/causal_deepar_estimator.py
+++ b/pts/model/causal_deepar/causal_deepar_estimator.py
@@ -26,7 +26,7 @@ from gluonts.transform import (
     TestSplitSampler,
     ExpectedNumInstanceSampler,
 )
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.torch.modules.distribution_output import DistributionOutput
 from gluonts.model.predictor import Predictor
diff --git a/pts/model/deepar/deepar_estimator.py b/pts/model/deepar/deepar_estimator.py
index 89a03ec..eb0d673 100644
--- a/pts/model/deepar/deepar_estimator.py
+++ b/pts/model/deepar/deepar_estimator.py
@@ -26,7 +26,7 @@ from gluonts.transform import (
     TestSplitSampler,
     ExpectedNumInstanceSampler,
 )
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.torch.modules.distribution_output import DistributionOutput
 from gluonts.model.predictor import Predictor
diff --git a/pts/model/deepvar/deepvar_estimator.py b/pts/model/deepvar/deepvar_estimator.py
index 456b57a..2871912 100644
--- a/pts/model/deepvar/deepvar_estimator.py
+++ b/pts/model/deepvar/deepvar_estimator.py
@@ -7,7 +7,7 @@ from gluonts.core.component import validated
 from gluonts.dataset.field_names import FieldName
 from gluonts.time_feature import TimeFeature
 from gluonts.torch.modules.distribution_output import DistributionOutput
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.model.predictor import Predictor
 from gluonts.transform import (
diff --git a/pts/model/lstnet/lstnet_estimator.py b/pts/model/lstnet/lstnet_estimator.py
index e98981b..e6d489f 100644
--- a/pts/model/lstnet/lstnet_estimator.py
+++ b/pts/model/lstnet/lstnet_estimator.py
@@ -6,7 +6,7 @@ import torch.nn as nn
 
 from gluonts.core.component import validated
 from gluonts.dataset.field_names import FieldName
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.model.predictor import Predictor
 from gluonts.transform import (
@@ -32,7 +32,7 @@ class LSTNetEstimator(PyTorchEstimator):
     def __init__(
         self,
         freq: str,
-        prediction_length: int,
+        prediction_length: Optional[int],
         context_length: int,
         num_series: int,
         ar_window: int = 24,
diff --git a/pts/model/n_beats/n_beats_estimator.py b/pts/model/n_beats/n_beats_estimator.py
index d83a075..40f3e5b 100644
--- a/pts/model/n_beats/n_beats_estimator.py
+++ b/pts/model/n_beats/n_beats_estimator.py
@@ -7,7 +7,7 @@ from gluonts.core.component import validated
 from gluonts.dataset.field_names import FieldName
 from gluonts.model.predictor import Predictor
 from gluonts.torch.model.predictor import PyTorchPredictor
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.transform import (
     InstanceSplitter,
     ValidationSplitSampler,
diff --git a/pts/model/simple_feedforward/simple_feedforward_estimator.py b/pts/model/simple_feedforward/simple_feedforward_estimator.py
index 37b738c..1275d40 100644
--- a/pts/model/simple_feedforward/simple_feedforward_estimator.py
+++ b/pts/model/simple_feedforward/simple_feedforward_estimator.py
@@ -4,7 +4,7 @@ import torch
 import torch.nn as nn
 
 from gluonts.core.component import validated
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.torch.modules.distribution_output import DistributionOutput
 from gluonts.model.predictor import Predictor
diff --git a/pts/model/tempflow/tempflow_estimator.py b/pts/model/tempflow/tempflow_estimator.py
index f989ebd..85d65cc 100644
--- a/pts/model/tempflow/tempflow_estimator.py
+++ b/pts/model/tempflow/tempflow_estimator.py
@@ -6,7 +6,7 @@ from gluonts.core.component import validated
 from gluonts.dataset.field_names import FieldName
 from gluonts.time_feature import TimeFeature
 from gluonts.torch.model.predictor import PyTorchPredictor
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.model.predictor import Predictor
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.transform import (
diff --git a/pts/model/tft/tft_estimator.py b/pts/model/tft/tft_estimator.py
index 66d1c52..c8ca91d 100644
--- a/pts/model/tft/tft_estimator.py
+++ b/pts/model/tft/tft_estimator.py
@@ -12,7 +12,7 @@ from gluonts.time_feature import (
     time_features_from_frequency_str,
 )
 from gluonts.torch.model.predictor import PyTorchPredictor
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.transform import (
     Transformation,
     Chain,
diff --git a/pts/model/time_grad/time_grad_estimator.py b/pts/model/time_grad/time_grad_estimator.py
index 9df86b4..955d3e6 100644
--- a/pts/model/time_grad/time_grad_estimator.py
+++ b/pts/model/time_grad/time_grad_estimator.py
@@ -5,7 +5,7 @@ import torch
 from gluonts.dataset.field_names import FieldName
 from gluonts.time_feature import TimeFeature
 from gluonts.torch.model.predictor import PyTorchPredictor
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.model.predictor import Predictor
 from gluonts.transform import (
     Transformation,
diff --git a/pts/model/transformer/transformer_estimator.py b/pts/model/transformer/transformer_estimator.py
index 4adc188..d2df1df 100644
--- a/pts/model/transformer/transformer_estimator.py
+++ b/pts/model/transformer/transformer_estimator.py
@@ -8,7 +8,7 @@ from gluonts.core.component import validated
 from gluonts.dataset.field_names import FieldName
 from gluonts.time_feature import TimeFeature
 from gluonts.torch.modules.distribution_output import DistributionOutput
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.model.predictor import Predictor
 from gluonts.transform import (
diff --git a/pts/model/transformer_tempflow/transformer_tempflow_estimator.py b/pts/model/transformer_tempflow/transformer_tempflow_estimator.py
index 4346abd..00a997a 100644
--- a/pts/model/transformer_tempflow/transformer_tempflow_estimator.py
+++ b/pts/model/transformer_tempflow/transformer_tempflow_estimator.py
@@ -5,7 +5,7 @@ import torch
 from gluonts.core.component import validated
 from gluonts.dataset.field_names import FieldName
 from gluonts.time_feature import TimeFeature
-from gluonts.torch.support.util import copy_parameters
+from gluonts.torch.util import copy_parameters
 from gluonts.torch.model.predictor import PyTorchPredictor
 from gluonts.model.predictor import Predictor
 from gluonts.transform import (
diff --git a/pts/trainer.py b/pts/trainer.py
index 19c6453..852a04a 100644
--- a/pts/trainer.py
+++ b/pts/trainer.py
@@ -2,7 +2,6 @@ import time
 from typing import List, Optional, Union
 
 from tqdm.auto import tqdm
-import wandb
 
 import torch
 import torch.nn as nn
@@ -23,7 +22,6 @@ class Trainer:
         learning_rate: float = 1e-3,
         weight_decay: float = 1e-6,
         maximum_learning_rate: float = 1e-2,
-        wandb_mode: str = "disabled",
         clip_gradient: Optional[float] = None,
         device: Optional[Union[torch.device, str]] = None,
         **kwargs,
@@ -36,7 +34,6 @@ class Trainer:
         self.maximum_learning_rate = maximum_learning_rate
         self.clip_gradient = clip_gradient
         self.device = device
-        wandb.init(mode=wandb_mode, **kwargs)
 
     def __call__(
         self,
@@ -44,8 +41,6 @@ class Trainer:
         train_iter: DataLoader,
         validation_iter: Optional[DataLoader] = None,
     ) -> None:
-        wandb.watch(net, log="all", log_freq=self.num_batches_per_epoch)
-
         optimizer = Adam(
             net.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
         )
@@ -86,8 +81,6 @@ class Trainer:
                         refresh=False,
                     )
 
-                    wandb.log({"loss": loss.item()})
-
                     loss.backward()
                     if self.clip_gradient is not None:
                         nn.utils.clip_grad_norm_(net.parameters(), self.clip_gradient)
@@ -127,7 +120,6 @@ class Trainer:
                         if self.num_batches_per_epoch == batch_no:
                             break
 
-                    wandb.log({"avg_val_loss": avg_epoch_loss_val})
                 it.close()
 
             # mark epoch end time and log time cost of current epoch
diff --git a/setup.py b/setup.py
index fa2e35e..f78a829 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ setup(
     python_requires=">=3.6",
     install_requires=[
         "torch>=1.8.0",
-        "gluonts>=0.7.0",
+        "gluonts>=0.8.0",
         "holidays",
         "numpy~=1.16",
         "pandas~=1.1",
@@ -24,7 +24,6 @@ setup(
         "tqdm",
         "matplotlib",
         "tensorboard",
-        "wandb",
     ],
     test_suite="tests",
     tests_require=["flake8", "pytest"],
diff --git a/test/model/deepar/test_auxillary_outputs.py b/test/model/deepar/test_auxillary_outputs.py
index 200f514..dfca0f0 100644
--- a/test/model/deepar/test_auxillary_outputs.py
+++ b/test/model/deepar/test_auxillary_outputs.py
@@ -37,7 +37,7 @@ def test_distribution():
     estimator = DeepAREstimator(
         freq=freq,
         prediction_length=prediction_length,
-        input_size=48,
+        input_size=15,
         trainer=Trainer(epochs=1, num_batches_per_epoch=1),
         distr_output=StudentTOutput(),
     )
diff --git a/test/modules/test_implicit_quantile_distr_output.py b/test/modules/test_implicit_quantile_distr_output.py
index dd433c8..917a300 100644
--- a/test/modules/test_implicit_quantile_distr_output.py
+++ b/test/modules/test_implicit_quantile_distr_output.py
@@ -56,7 +56,7 @@ def learn_distribution(
             distr = distr_output.distribution(distr_args)
             loss = -distr.log_prob(sample_label).mean()
             loss.backward()
-            clip_grad_norm_(arg_proj.parameters(), 10.0)
+            # NOTE(review): gradient clipping disabled here, confirm intent: clip_grad_norm_(arg_proj.parameters(), 10.0)
             optimizer.step()
 
             num_batches += 1
@@ -77,7 +77,7 @@ def learn_distribution(
             torch.ones((1, 1, 1)), torch.ones((1, 1)) * 0.1
         )
 
-    return samples.mean(), samples.std(), percentile_10, percentile_90
+    return samples.mean(), samples.std(), percentile_10.squeeze(), percentile_90.squeeze()
 
 
 def test_independent_implicit_quantile() -> None:
@@ -181,7 +181,7 @@ def test_training_with_implicit_quantile_output():
             num_batches_per_epoch=3,
             batch_size=256,
         ),
-        input_size=48,
+        input_size=15,
     )
     deepar_predictor = deepar_estimator.train(dataset.train, num_workers=1)
     forecast_it, ts_it = make_evaluation_predictions(
@@ -224,7 +224,7 @@ def test_instanciation_of_args_proj():
             num_batches_per_epoch=1,
             batch_size=256,
         ),
-        input_size=48,
+        input_size=15,
     )
     assert distr_output.method_calls == 1
     deepar_predictor = deepar_estimator.train(dataset.train, num_workers=1)
@@ -258,7 +258,7 @@ def test_instanciation_of_args_proj():
             num_batches_per_epoch=1,
             batch_size=256,
         ),
-        input_size=48,
+        input_size=15,
     )
     assert distr_output.method_calls == 3
     new_estimator.train(dataset.train, num_workers=1)