diff --git a/.travis.yml b/.travis.yml
index 3edc3ee..be93c59 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -32,25 +32,39 @@ install:
 
   - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py
   - source activate test-environment
-  - pip install pytest-cov python-coveralls pytest-xdist coverage==3.7.1 #we need this version of coverage for coveralls.io to work
+  - pip install pytest-cov pytest-xdist
   - pip install pep8 pytest-pep8
+  - conda install mkl mkl-service
   - pip install theano
   - pip install git+git://github.com/fchollet/keras.git
 
   # install PIL for preprocessing tests
-  #- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
-  #    conda install pil;
-  #  elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then
-  #    conda install Pillow;
-  #  fi
+  - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
+      conda install pil;
+    elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then
+      conda install Pillow;
+    fi
 
-  - python setup.py install
+  - pip install -e .[tests]
 
-  # install TensorFlow (CPU)
+  # install TensorFlow (CPU version).
   - pip install tensorflow
+  
+  # install cntk
+  - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
+      pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.2-cp27-cp27mu-linux_x86_64.whl;
+    elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then
+      pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.2-cp35-cp35m-linux_x86_64.whl;
+    fi
+
+  # install pydot for visualization tests
+  - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
+      conda install pydot graphviz;
+    fi
 
 # command to run tests
 script:
+  - export MKL_THREADING_LAYER="GNU"
   # run keras backend init to initialize backend config
   - python -c "import keras.backend"
   # create dataset directory to avoid concurrent directory creation at runtime
@@ -61,7 +75,5 @@ script:
   - if [[ "$TEST_MODE" == "PEP8" ]]; then
        PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0;
     else
-       PYTHONPATH=$PWD:$PYTHONPATH py.test tests/;
+       PYTHONPATH=$PWD:$PYTHONPATH py.test tests/ --ignore=tests/integration_tests --ignore=tests/test_documentation.py --cov=keras tests/ --cov-report term-missing;
     fi
-after_success:
-  - coveralls
diff --git a/GUIDELINES.md b/GUIDELINES.md
index bb95dcd..df0bed9 100644
--- a/GUIDELINES.md
+++ b/GUIDELINES.md
@@ -2,6 +2,7 @@
 
 ## Maintainers:
 Following are the users with write-access to this repository (maintainers) :
+* [athundt](https://www.github.com/athundt)
 * [bstriner](https://www.github.com/bstriner)
 * [farizrahman4u](https://www.github.com/farizrahman4u)
 * [fchollet](https://www.github.com/fchollet)
diff --git a/examples/cifar10_densenet.py b/examples/cifar10_densenet.py
index 79c6c3d..84e2a33 100644
--- a/examples/cifar10_densenet.py
+++ b/examples/cifar10_densenet.py
@@ -33,8 +33,11 @@ nb_filter = 16
 dropout_rate = 0.0  # 0.0 for data augmentation
 
 # Create the model (without loading weights)
-model = DenseNet(depth, nb_dense_block, growth_rate, nb_filter, dropout_rate=dropout_rate,
-                 input_shape=img_dim, weights=None)
+model = DenseNet(depth=depth, nb_dense_block=nb_dense_block,
+                 growth_rate=growth_rate, nb_filter=nb_filter,
+                 dropout_rate=dropout_rate,
+                 input_shape=img_dim,
+                 weights=None)
 print('Model created')
 
 model.summary()
diff --git a/examples/cifar10_nasnet.py b/examples/cifar10_nasnet.py
new file mode 100644
index 0000000..56c75ee
--- /dev/null
+++ b/examples/cifar10_nasnet.py
@@ -0,0 +1,106 @@
+"""
+Adapted from keras example cifar10_cnn.py
+Train NASNet-CIFAR on the CIFAR10 small images dataset.
+"""
+from __future__ import print_function
+from keras.datasets import cifar10
+from keras.preprocessing.image import ImageDataGenerator
+from keras.utils import np_utils
+from keras.callbacks import ModelCheckpoint
+from keras.callbacks import ReduceLROnPlateau
+from keras.callbacks import CSVLogger
+from keras.optimizers import Adam
+from keras_contrib.applications.nasnet import NASNetCIFAR, preprocess_input
+
+import numpy as np
+
+
+weights_file = 'NASNet-CIFAR-10.h5'
+lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.5), cooldown=0, patience=5, min_lr=0.5e-5)
+csv_logger = CSVLogger('NASNet-CIFAR-10.csv')
+model_checkpoint = ModelCheckpoint(weights_file, monitor='val_predictions_acc', save_best_only=True,
+                                   save_weights_only=True, mode='max')
+
+batch_size = 128
+nb_classes = 10
+nb_epoch = 600
+data_augmentation = True
+
+# input image dimensions
+img_rows, img_cols = 32, 32
+# The CIFAR10 images are RGB.
+img_channels = 3
+
+# The data, shuffled and split between train and test sets:
+(X_train, y_train), (X_test, y_test) = cifar10.load_data()
+
+# Convert class vectors to binary class matrices.
+Y_train = np_utils.to_categorical(y_train, nb_classes)
+Y_test = np_utils.to_categorical(y_test, nb_classes)
+
+X_train = X_train.astype('float32')
+X_test = X_test.astype('float32')
+
+# preprocess input
+X_train = preprocess_input(X_train)
+X_test = preprocess_input(X_test)
+
+# For training, the auxiliary branch must be used to correctly train NASNet
+model = NASNetCIFAR((img_rows, img_cols, img_channels), use_auxilary_branch=True)
+model.summary()
+
+optimizer = Adam(lr=1e-3, clipnorm=5)
+model.compile(loss=['categorical_crossentropy', 'categorical_crossentropy'],
+              optimizer=optimizer, metrics=['accuracy'], loss_weights=[1.0, 0.4])
+
+# model.load_weights('NASNet-CIFAR-10.h5', by_name=True)
+
+if not data_augmentation:
+    print('Not using data augmentation.')
+    model.fit(X_train, [Y_train, Y_train],
+              batch_size=batch_size,
+              epochs=nb_epoch,
+              validation_data=(X_test, [Y_test, Y_test]),
+              shuffle=True,
+              verbose=2,
+              callbacks=[lr_reducer, csv_logger, model_checkpoint])
+else:
+    print('Using real-time data augmentation.')
+    # This will do preprocessing and realtime data augmentation:
+    datagen = ImageDataGenerator(
+        featurewise_center=False,  # set input mean to 0 over the dataset
+        samplewise_center=False,  # set each sample mean to 0
+        featurewise_std_normalization=False,  # divide inputs by std of the dataset
+        samplewise_std_normalization=False,  # divide each input by its std
+        zca_whitening=False,  # apply ZCA whitening
+        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
+        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
+        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
+        horizontal_flip=True,  # randomly flip images
+        vertical_flip=False)  # randomly flip images
+
+    # Compute quantities required for featurewise normalization
+    # (std, mean, and principal components if ZCA whitening is applied).
+    datagen.fit(X_train)
+
+    # Wrap the ImageDataGenerator to yield two label batches [y, y] for each input batch X
+    # When training a NASNet model, we have to use its auxiliary training head.
+    # Therefore the model is technically a 1 input - 2 output model, and requires
+    # the labels to be duplicated for the auxiliary head.
+    def image_data_generator_wrapper(image_datagenerator, batch_size):
+        iterator = datagen.flow(X_train, Y_train, batch_size=batch_size)
+
+        while True:
+            X, y = next(iterator)  # get the next batch
+            yield X, [y, y]  # duplicate the labels for each batch
+
+    # Fit the model on the batches generated by datagen.flow().
+    model.fit_generator(image_data_generator_wrapper(datagen, batch_size),
+                        steps_per_epoch=X_train.shape[0] // batch_size,
+                        validation_data=(X_test, [Y_test, Y_test]),
+                        epochs=nb_epoch, verbose=2,
+                        callbacks=[lr_reducer, csv_logger, model_checkpoint])
+
+scores = model.evaluate(X_test, [Y_test, Y_test], batch_size=batch_size)
+for score, metric_name in zip(scores, model.metrics_names):
+    print("%s : %0.4f" % (metric_name, score))
diff --git a/examples/cifar10_resnet.py b/examples/cifar10_resnet.py
new file mode 100644
index 0000000..edb6384
--- /dev/null
+++ b/examples/cifar10_resnet.py
@@ -0,0 +1,96 @@
+"""
+Adapted from keras example cifar10_cnn.py and github.com/raghakot/keras-resnet
+Train ResNet-18 on the CIFAR10 small images dataset.
+
+GPU run command with Theano backend (with TensorFlow, the GPU is automatically used):
+    THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python cifar10_resnet.py
+"""
+from __future__ import print_function
+from keras.datasets import cifar10
+from keras.preprocessing.image import ImageDataGenerator
+from keras.utils import np_utils
+from keras.callbacks import ModelCheckpoint
+from keras.callbacks import ReduceLROnPlateau
+from keras.callbacks import CSVLogger
+from keras.callbacks import EarlyStopping
+from keras_contrib.applications.resnet import ResNet18
+
+import numpy as np
+
+
+weights_file = 'ResNet18v2-CIFAR-10.h5'
+lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=5, min_lr=0.5e-6)
+early_stopper = EarlyStopping(min_delta=0.001, patience=10)
+csv_logger = CSVLogger('ResNet18v2-CIFAR-10.csv')
+model_checkpoint = ModelCheckpoint(weights_file, monitor='val_acc', save_best_only=True,
+                                   save_weights_only=True, mode='auto')
+
+batch_size = 32
+nb_classes = 10
+nb_epoch = 200
+data_augmentation = True
+
+# input image dimensions
+img_rows, img_cols = 32, 32
+# The CIFAR10 images are RGB.
+img_channels = 3
+
+# The data, shuffled and split between train and test sets:
+(X_train, y_train), (X_test, y_test) = cifar10.load_data()
+
+# Convert class vectors to binary class matrices.
+Y_train = np_utils.to_categorical(y_train, nb_classes)
+Y_test = np_utils.to_categorical(y_test, nb_classes)
+
+X_train = X_train.astype('float32')
+X_test = X_test.astype('float32')
+
+# subtract mean and normalize
+mean_image = np.mean(X_train, axis=0)
+X_train -= mean_image
+X_test -= mean_image
+X_train /= 128.
+X_test /= 128.
+
+model = ResNet18((img_rows, img_cols, img_channels), nb_classes)
+model.compile(loss='categorical_crossentropy',
+              optimizer='adam',
+              metrics=['accuracy'])
+
+if not data_augmentation:
+    print('Not using data augmentation.')
+    model.fit(X_train, Y_train,
+              batch_size=batch_size,
+              nb_epoch=nb_epoch,
+              validation_data=(X_test, Y_test),
+              shuffle=True,
+              callbacks=[lr_reducer, early_stopper, csv_logger, model_checkpoint])
+else:
+    print('Using real-time data augmentation.')
+    # This will do preprocessing and realtime data augmentation:
+    datagen = ImageDataGenerator(
+        featurewise_center=False,  # set input mean to 0 over the dataset
+        samplewise_center=False,  # set each sample mean to 0
+        featurewise_std_normalization=False,  # divide inputs by std of the dataset
+        samplewise_std_normalization=False,  # divide each input by its std
+        zca_whitening=False,  # apply ZCA whitening
+        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
+        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
+        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
+        horizontal_flip=True,  # randomly flip images
+        vertical_flip=False)  # randomly flip images
+
+    # Compute quantities required for featurewise normalization
+    # (std, mean, and principal components if ZCA whitening is applied).
+    datagen.fit(X_train)
+
+    # Fit the model on the batches generated by datagen.flow().
+    model.fit_generator(datagen.flow(X_train, Y_train, batch_size=batch_size),
+                        steps_per_epoch=X_train.shape[0] // batch_size,
+                        validation_data=(X_test, Y_test),
+                        epochs=nb_epoch, verbose=2,
+                        callbacks=[lr_reducer, early_stopper, csv_logger, model_checkpoint])
+
+scores = model.evaluate(X_test, Y_test, batch_size=batch_size)
+print('Test loss : ', scores[0])
+print('Test accuracy : ', scores[1])
diff --git a/keras_contrib/applications/__init__.py b/keras_contrib/applications/__init__.py
index e9d829d..a1592a7 100644
--- a/keras_contrib/applications/__init__.py
+++ b/keras_contrib/applications/__init__.py
@@ -1,2 +1,5 @@
 from .densenet import DenseNet
 from .ror import ResidualOfResidual
+from .resnet import ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152
+from .wide_resnet import WideResidualNetwork
+from .nasnet import NASNet, NASNetLarge, NASNetMobile
diff --git a/keras_contrib/applications/densenet.py b/keras_contrib/applications/densenet.py
index ff72ceb..b885703 100644
--- a/keras_contrib/applications/densenet.py
+++ b/keras_contrib/applications/densenet.py
@@ -1,8 +1,46 @@
 # -*- coding: utf-8 -*-
-'''DenseNet models for Keras.
+'''DenseNet and DenseNet-FCN models for Keras.
+
+DenseNet is a network architecture where each layer is directly connected
+to every other layer in a feed-forward fashion (within each dense block).
+For each layer, the feature maps of all preceding layers are treated as
+separate inputs whereas its own feature maps are passed on as inputs to
+all subsequent layers. This connectivity pattern yields state-of-the-art
+accuracies on CIFAR10/100 (with or without data augmentation) and SVHN.
+On the large scale ILSVRC 2012 (ImageNet) dataset, DenseNet achieves a
+similar accuracy as ResNet, but using less than half the amount of
+parameters and roughly half the number of FLOPs.
+
+DenseNets support any input image size of 32x32 or greater, and are thus
+suited for CIFAR-10 or CIFAR-100 datasets. There are two types of DenseNets,
+one suited for smaller images (DenseNet) and one suited for ImageNet,
+called DenseNetImageNet. They are differentiated by the strided convolution
+and pooling operations prior to the initial dense block.
+
+The following table describes the size and error rate of DenseNetImageNet models
+on the ImageNet dataset (single crop), for which weights are provided:
+------------------------------------------------------------------------------------
+|   Model type      | ImageNet Err (Top 1)  |  ImageNet Err (Top 5) |  Params (M)  |
+------------------------------------------------------------------------------------
+|   DenseNet-121    |    25.02 %            |        7.71 %         |     8.0      |
+|   DenseNet-169    |    23.80 %            |        6.85 %         |     14.3     |
+|   DenseNet-201    |    22.58 %            |        6.34 %         |     20.2     |
+|   DenseNet-161    |    22.20 %            |         -   %         |     28.9     |
+------------------------------------------------------------------------------------
+
+DenseNets can be extended to image segmentation tasks as described in the
+paper "The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for
+Semantic Segmentation". Here, the dense blocks are arranged and concatenated
+with long skip connections for state of the art performance on the CamVid dataset.
+
 # Reference
 - [Densely Connected Convolutional Networks](https://arxiv.org/pdf/1608.06993.pdf)
 - [The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for Semantic Segmentation](https://arxiv.org/pdf/1611.09326.pdf)
+
+This implementation is based on the following reference code:
+ - https://github.com/gpleiss/efficient_densenet_pytorch
+ - https://github.com/liuzhuang13/DenseNet
+
 '''
 from __future__ import print_function
 from __future__ import absolute_import
@@ -11,89 +49,147 @@ from __future__ import division
 import warnings
 
 from keras.models import Model
-from keras.layers.core import Dense, Dropout, Activation, Reshape
-from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D
-from keras.layers.pooling import AveragePooling2D
-from keras.layers.pooling import GlobalAveragePooling2D
+from keras.layers import Dense
+from keras.layers import Dropout
+from keras.layers import Activation
+from keras.layers import Reshape
+from keras.layers import Conv2D
+from keras.layers import Conv2DTranspose
+from keras.layers import UpSampling2D
+from keras.layers import MaxPooling2D
+from keras.layers import AveragePooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.layers import GlobalAveragePooling2D
 from keras.layers import Input
-from keras.layers.merge import concatenate
-from keras.layers.normalization import BatchNormalization
+from keras.layers import concatenate
+from keras.layers import BatchNormalization
 from keras.regularizers import l2
 from keras.utils.layer_utils import convert_all_kernels_in_model
 from keras.utils.data_utils import get_file
 from keras.engine.topology import get_source_inputs
 from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.applications.imagenet_utils import decode_predictions
+from keras.applications.imagenet_utils import preprocess_input as _preprocess_input
 import keras.backend as K
 
 from keras_contrib.layers.convolutional import SubPixelUpscaling
 
-TH_WEIGHTS_PATH = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering.h5'
-TF_WEIGHTS_PATH = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering.h5'
-TH_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering-no-top.h5'
-TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering-no-top.h5'
+DENSENET_121_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32.h5'
+DENSENET_161_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-161-48.h5'
+DENSENET_169_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-169-32.h5'
+DENSENET_121_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32-no-top.h5'
+DENSENET_161_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-161-48-no-top.h5'
+DENSENET_169_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-169-32-no-top.h5'
 
 
-def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=16, nb_layers_per_block=-1,
-             bottleneck=False, reduction=0.0, dropout_rate=0.0, weight_decay=1E-4,
-             include_top=True, weights='cifar10', input_tensor=None,
-             classes=10, activation='softmax'):
-    '''Instantiate the DenseNet architecture,
-        optionally loading weights pre-trained
-        on CIFAR-10. Note that when using TensorFlow,
-        for best performance you should set
-        `image_data_format='channels_last'` in your Keras config
-        at ~/.keras/keras.json.
-        The model and the weights are compatible with both
-        TensorFlow and Theano. The dimension ordering
-        convention used by the model is the one
-        specified in your Keras config file.
-        # Arguments
-            input_shape: optional shape tuple, only to be specified
-                if `include_top` is False (otherwise the input shape
-                has to be `(32, 32, 3)` (with `channels_last` dim ordering)
-                or `(3, 32, 32)` (with `channels_first` dim ordering).
-                It should have exactly 3 inputs channels,
-                and width and height should be no smaller than 8.
-                E.g. `(200, 200, 3)` would be one valid value.
-            depth: number or layers in the DenseNet
-            nb_dense_block: number of dense blocks to add to end (generally = 3)
-            growth_rate: number of filters to add per dense block
-            nb_filter: initial number of filters. -1 indicates initial
-                number of filters is 2 * growth_rate
-            nb_layers_per_block: number of layers in each dense block.
-                Can be a -1, positive integer or a list.
-                If -1, calculates nb_layer_per_block from the network depth.
-                If positive integer, a set number of layers per dense block.
-                If list, nb_layer is used as provided. Note that list size must
-                be (nb_dense_block + 1)
-            bottleneck: flag to add bottleneck blocks in between dense blocks
-            reduction: reduction factor of transition blocks.
-                Note : reduction value is inverted to compute compression.
-            dropout_rate: dropout rate
-            weight_decay: weight decay factor
-            include_top: whether to include the fully-connected
-                layer at the top of the network.
-            weights: one of `None` (random initialization) or
-                'cifar10' (pre-training on CIFAR-10)..
-            input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-                to use as image input for the model.
-            classes: optional number of classes to classify images
-                into, only to be specified if `include_top` is True, and
-                if no `weights` argument is specified.
-            activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
-                Note that if sigmoid is used, classes must be 1.
-        # Returns
-            A Keras model instance.
-        '''
+def preprocess_input(x, data_format=None):
+    """Preprocesses a tensor encoding a batch of images.
 
-    if weights not in {'cifar10', None}:
+    # Arguments
+        x: input Numpy tensor, 4D.
+        data_format: data format of the image tensor.
+
+    # Returns
+        Preprocessed tensor.
+    """
+    x = _preprocess_input(x, data_format=data_format)
+    x *= 0.017  # scale values
+    return x
+
+
+def DenseNet(input_shape=None,
+             depth=40,
+             nb_dense_block=3,
+             growth_rate=12,
+             nb_filter=-1,
+             nb_layers_per_block=-1,
+             bottleneck=False,
+             reduction=0.0,
+             dropout_rate=0.0,
+             weight_decay=1e-4,
+             subsample_initial_block=False,
+             include_top=True,
+             weights=None,
+             input_tensor=None,
+             pooling=None,
+             classes=10,
+             activation='softmax'):
+    '''Instantiate the DenseNet architecture.
+
+    The model and the weights are compatible with both
+    TensorFlow and Theano. The dimension ordering
+    convention used by the model is the one
+    specified in your Keras config file.
+
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` (with `channels_last` dim ordering)
+            or `(3, 224, 224)` (with `channels_first` dim ordering).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 8.
+            E.g. `(224, 224, 3)` would be one valid value.
+        depth: number of layers in the DenseNet
+        nb_dense_block: number of dense blocks to add to end
+        growth_rate: number of filters to add per dense block
+        nb_filter: initial number of filters. -1 indicates initial
+            number of filters will default to 2 * growth_rate
+        nb_layers_per_block: number of layers in each dense block.
+            Can be a -1, positive integer or a list.
+            If -1, calculates nb_layer_per_block from the network depth.
+            If positive integer, a set number of layers per dense block.
+            If list, nb_layer is used as provided. Note that list size must
+            be nb_dense_block
+        bottleneck: flag to add bottleneck blocks in between dense blocks
+        reduction: reduction factor of transition blocks.
+            Note : reduction value is inverted to compute compression.
+        dropout_rate: dropout rate
+        weight_decay: weight decay rate
+        subsample_initial_block: Changes model type to suit different datasets.
+            Should be set to True for ImageNet, and False for CIFAR datasets.
+            When set to True, the initial convolution will be strided and
+            adds a MaxPooling2D before the initial dense block.
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: one of `None` (random initialization) or
+            'imagenet' (pre-training on ImageNet).
+        input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        activation: Type of activation at the top layer. Can be one of
+            'softmax' or 'sigmoid'. Note that if sigmoid is used,
+            classes must be 1.
+
+    # Returns
+        A Keras model instance.
+
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+    '''
+
+    if weights not in {'imagenet', None}:
         raise ValueError('The `weights` argument should be either '
-                         '`None` (random initialization) or `cifar10` '
-                         '(pre-training on CIFAR-10).')
+                         '`None` (random initialization) or `imagenet` '
+                         '(pre-training on ImageNet).')
 
-    if weights == 'cifar10' and include_top and classes != 10:
-        raise ValueError('If using `weights` as CIFAR 10 with `include_top`'
-                         ' as true, `classes` should be 10')
+    if weights == 'imagenet' and include_top and classes != 1000:
+        raise ValueError('If using `weights` as ImageNet with `include_top` '
+                         'as true, `classes` should be 1000')
 
     if activation not in ['softmax', 'sigmoid']:
         raise ValueError('activation must be one of "softmax" or "sigmoid"')
@@ -106,7 +202,7 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi
                                       default_size=32,
                                       min_size=8,
                                       data_format=K.image_data_format(),
-                                      include_top=include_top)
+                                      require_flatten=include_top)
 
     if input_tensor is None:
         img_input = Input(shape=input_shape)
@@ -117,8 +213,9 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi
             img_input = input_tensor
 
     x = __create_dense_net(classes, img_input, include_top, depth, nb_dense_block,
-                           growth_rate, nb_filter, nb_layers_per_block, bottleneck, reduction,
-                           dropout_rate, weight_decay, activation)
+                           growth_rate, nb_filter, nb_layers_per_block, bottleneck,
+                           reduction, dropout_rate, weight_decay, subsample_initial_block,
+                           pooling, activation)
 
     # Ensure that the model takes into account
     # any potential predecessors of `input_tensor`.
@@ -130,47 +227,69 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi
     model = Model(inputs, x, name='densenet')
 
     # load weights
-    if weights == 'cifar10':
-        if (depth == 40) and (nb_dense_block == 3) and (growth_rate == 12) and (nb_filter == 16) and \
-                (bottleneck is False) and (reduction == 0.0) and (dropout_rate == 0.0) and (weight_decay == 1E-4):
-            # Default parameters match. Weights for this model exist:
+    if weights == 'imagenet':
+        weights_loaded = False
 
-            if K.image_data_format() == 'channels_first':
-                if include_top:
-                    weights_path = get_file('densenet_40_12_th_dim_ordering_th_kernels.h5',
-                                            TH_WEIGHTS_PATH,
-                                            cache_subdir='models')
-                else:
-                    weights_path = get_file('densenet_40_12_th_dim_ordering_th_kernels_no_top.h5',
-                                            TH_WEIGHTS_PATH_NO_TOP,
-                                            cache_subdir='models')
-
-                model.load_weights(weights_path)
-
-                if K.backend() == 'tensorflow':
-                    warnings.warn('You are using the TensorFlow backend, yet you '
-                                  'are using the Theano '
-                                  'image dimension ordering convention '
-                                  '(`image_data_format="channels_first"`). '
-                                  'For best performance, set '
-                                  '`image_data_format="channels_last"` in '
-                                  'your Keras config '
-                                  'at ~/.keras/keras.json.')
-                    convert_all_kernels_in_model(model)
+        if (depth == 121) and (nb_dense_block == 4) and (growth_rate == 32) and (nb_filter == 64) and \
+                (bottleneck is True) and (reduction == 0.5) and (subsample_initial_block):
+            if include_top:
+                weights_path = get_file('DenseNet-BC-121-32.h5',
+                                        DENSENET_121_WEIGHTS_PATH,
+                                        cache_subdir='models',
+                                        md5_hash='a439dd41aa672aef6daba4ee1fd54abd')
             else:
-                if include_top:
-                    weights_path = get_file('densenet_40_12_tf_dim_ordering_tf_kernels.h5',
-                                            TF_WEIGHTS_PATH,
-                                            cache_subdir='models')
-                else:
-                    weights_path = get_file('densenet_40_12_tf_dim_ordering_tf_kernels_no_top.h5',
-                                            TF_WEIGHTS_PATH_NO_TOP,
-                                            cache_subdir='models')
+                weights_path = get_file('DenseNet-BC-121-32-no-top.h5',
+                                        DENSENET_121_WEIGHTS_PATH_NO_TOP,
+                                        cache_subdir='models',
+                                        md5_hash='55e62a6358af8a0af0eedf399b5aea99')
+            model.load_weights(weights_path, by_name=True)
+            weights_loaded = True
 
-                model.load_weights(weights_path)
+        if (depth == 161) and (nb_dense_block == 4) and (growth_rate == 48) and (nb_filter == 96) and \
+                (bottleneck is True) and (reduction == 0.5) and (subsample_initial_block):
+            if include_top:
+                weights_path = get_file('DenseNet-BC-161-48.h5',
+                                        DENSENET_161_WEIGHTS_PATH,
+                                        cache_subdir='models',
+                                        md5_hash='6c326cf4fbdb57d31eff04333a23fcca')
+            else:
+                weights_path = get_file('DenseNet-BC-161-48-no-top.h5',
+                                        DENSENET_161_WEIGHTS_PATH_NO_TOP,
+                                        cache_subdir='models',
+                                        md5_hash='1a9476b79f6b7673acaa2769e6427b92')
+            model.load_weights(weights_path, by_name=True)
+            weights_loaded = True
 
-                if K.backend() == 'theano':
-                    convert_all_kernels_in_model(model)
+        if (depth == 169) and (nb_dense_block == 4) and (growth_rate == 32) and (nb_filter == 64) and \
+                (bottleneck is True) and (reduction == 0.5) and (subsample_initial_block):
+            if include_top:
+                weights_path = get_file('DenseNet-BC-169-32.h5',
+                                        DENSENET_169_WEIGHTS_PATH,
+                                        cache_subdir='models',
+                                        md5_hash='914869c361303d2e39dec640b4e606a6')
+            else:
+                weights_path = get_file('DenseNet-BC-169-32-no-top.h5',
+                                        DENSENET_169_WEIGHTS_PATH_NO_TOP,
+                                        cache_subdir='models',
+                                        md5_hash='89c19e8276cfd10585d5fadc1df6859e')
+            model.load_weights(weights_path, by_name=True)
+            weights_loaded = True
+
+        if weights_loaded:
+            if K.backend() == 'theano':
+                convert_all_kernels_in_model(model)
+
+            if K.image_data_format() == 'channels_first' and K.backend() == 'tensorflow':
+                warnings.warn('You are using the TensorFlow backend, yet you '
+                              'are using the Theano '
+                              'image data format convention '
+                              '(`image_data_format="channels_first"`). '
+                              'For best performance, set '
+                              '`image_data_format="channels_last"` in '
+                              'your Keras config '
+                              'at ~/.keras/keras.json.')
+
+            print("Weights for the model were loaded successfully")
 
     return model
 
@@ -297,135 +416,297 @@ def DenseNetFCN(input_shape, nb_dense_block=5, growth_rate=16, nb_layers_per_blo
     return model
 
 
-def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1E-4):
-    ''' Apply BatchNorm, Relu, 3x3 Conv2D, optional bottleneck block and dropout
-    Args:
-        ip: Input keras tensor
-        nb_filter: number of filters
-        bottleneck: add bottleneck block
+def DenseNetImageNet121(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=121, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 24, 16], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet169(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=169, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 32, 32], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet201(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights=None,
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=201, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 48, 32], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet264(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights=None,
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=264, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 64, 48], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet161(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=161, nb_dense_block=4, growth_rate=48, nb_filter=96,
+                    nb_layers_per_block=[6, 12, 36, 24], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def name_or_none(prefix, name):
+    return prefix + name if (prefix is not None and name is not None) else None
+
+
+def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1e-4, block_prefix=None):
+    '''
+    Adds a convolution layer (with batch normalization and relu),
+    and optionally a bottleneck layer.
+
+    # Arguments
+        ip: Input tensor
+        nb_filter: integer, the dimensionality of the output space
+            (i.e. the number of output filters in the convolution)
+        bottleneck: if True, adds a bottleneck convolution block
         dropout_rate: dropout rate
         weight_decay: weight decay factor
-    Returns: keras tensor with batch_norm, relu and convolution2d added (optional bottleneck)
+        block_prefix: str, for unique layer naming
+
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to stride.
+
+    # Returns
+        output tensor of block
     '''
+    with K.name_scope('ConvBlock'):
+        concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
 
-    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name=name_or_none(block_prefix, '_bn'))(ip)
+        x = Activation('relu')(x)
 
-    x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                           beta_regularizer=l2(weight_decay))(ip)
-    x = Activation('relu')(x)
+        if bottleneck:
+            inter_channel = nb_filter * 4
 
-    if bottleneck:
-        inter_channel = nb_filter * 4  # Obtained from https://github.com/liuzhuang13/DenseNet/blob/master/densenet.lua
-
-        x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False,
-                   kernel_regularizer=l2(weight_decay))(x)
+            x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_normal', padding='same', use_bias=False,
+                       kernel_regularizer=l2(weight_decay), name=name_or_none(block_prefix, '_bottleneck_conv2D'))(x)
+            x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5,
+                                   name=name_or_none(block_prefix, '_bottleneck_bn'))(x)
+            x = Activation('relu')(x)
 
+        x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_normal', padding='same', use_bias=False,
+                   kernel_regularizer=l2(weight_decay), name=name_or_none(block_prefix, '_conv2D'))(x)
         if dropout_rate:
             x = Dropout(dropout_rate)(x)
 
-        x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                               beta_regularizer=l2(weight_decay))(x)
+    return x
+
+
+def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropout_rate=None,
+                  weight_decay=1e-4, grow_nb_filters=True, return_concat_list=False, block_prefix=None):
+    '''
+    Build a dense_block where the output of each conv_block is fed
+    to subsequent ones
+
+    # Arguments
+        x: input keras tensor
+        nb_layers: the number of conv_blocks to append to the model
+        nb_filter: integer, filter count tracked across the block; grows by
+            growth_rate per conv_block when grow_nb_filters is True
+        growth_rate: growth rate of the dense block
+        bottleneck: if True, adds a bottleneck convolution block to
+            each conv_block
+        dropout_rate: dropout rate
+        weight_decay: weight decay factor
+        grow_nb_filters: if True, allows number of filters to grow
+        return_concat_list: set to True to return the list of
+            feature maps along with the actual output
+        block_prefix: str, for block unique naming
+
+    # Returns
+        If return_concat_list is True, returns a tuple of the output
+        keras tensor, the number of filters and a list holding the
+        block input followed by the output of every conv_block
+
+        If return_concat_list is False, returns a tuple of the output
+        keras tensor and the number of filters
+    '''
+    with K.name_scope('DenseBlock'):
+        concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+        x_list = [x]
+
+        for i in range(nb_layers):
+            cb = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay,
+                              block_prefix=name_or_none(block_prefix, '_%i' % i))
+            x_list.append(cb)
+
+            x = concatenate([x, cb], axis=concat_axis)
+
+            if grow_nb_filters:
+                nb_filter += growth_rate
+
+        if return_concat_list:
+            return x, nb_filter, x_list
+        else:
+            return x, nb_filter
+
+
+def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4, block_prefix=None):
+    '''
+    Applies batch normalization, relu and a pointwise (1x1) convolution,
+    followed by 2x2 average pooling with stride 2. The number of output
+    convolution filters shrinks as the compression parameter is reduced.
+
+    # Arguments
+        ip: input keras tensor
+        nb_filter: integer, number of filters before compression; the
+            convolution outputs int(nb_filter * compression) filters
+        compression: calculated as 1 - reduction. Reduces the number
+            of feature maps in the transition block.
+        weight_decay: weight decay factor
+        block_prefix: str, for block unique naming
+
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(samples, nb_filter * compression, rows / 2, cols / 2)`
+        if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows / 2, cols / 2, nb_filter * compression)`
+        if data_format='channels_last'.
+
+    # Returns
+        a keras tensor
+    '''
+    with K.name_scope('Transition'):
+        concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name=name_or_none(block_prefix, '_bn'))(ip)
         x = Activation('relu')(x)
+        x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_normal', padding='same',
+                   use_bias=False, kernel_regularizer=l2(weight_decay), name=name_or_none(block_prefix, '_conv2D'))(x)
+        x = AveragePooling2D((2, 2), strides=(2, 2))(x)
 
-    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', use_bias=False,
-               kernel_regularizer=l2(weight_decay))(x)
-    if dropout_rate:
-        x = Dropout(dropout_rate)(x)
-
-    return x
+        return x
 
 
-def __transition_block(ip, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4):
-    ''' Apply BatchNorm, Relu 1x1, Conv2D, optional compression, dropout and Maxpooling2D
-    Args:
-        ip: keras tensor
-        nb_filter: number of filters
-        compression: calculated as 1 - reduction. Reduces the number of feature maps
-                    in the transition block.
-        dropout_rate: dropout rate
+def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4, block_prefix=None):
+    '''Adds an upsampling block. The upsampling operation depends on the type parameter.
+
+    # Arguments
+        ip: input keras tensor
+        nb_filters: integer, the dimensionality of the output space
+            (i.e. the number of output filters in the convolution)
+        type: can be 'upsampling', 'subpixel', 'deconv'. Determines
+            type of upsampling performed
         weight_decay: weight decay factor
-    Returns: keras tensor, after applying upsampling operation.
+        block_prefix: str, for block unique naming
+
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(samples, nb_filters, rows * 2, cols * 2)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows * 2, cols * 2, nb_filters)` if data_format='channels_last'.
+
+    # Returns
+        a keras tensor
     '''
+    with K.name_scope('TransitionUp'):
 
-    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
-
-    x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                           beta_regularizer=l2(weight_decay))(ip)
-    x = Activation('relu')(x)
-    x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False,
-               kernel_regularizer=l2(weight_decay))(x)
-    if dropout_rate:
-        x = Dropout(dropout_rate)(x)
-    x = AveragePooling2D((2, 2), strides=(2, 2))(x)
-
-    return x
-
-
-def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropout_rate=None, weight_decay=1E-4,
-                  grow_nb_filters=True, return_concat_list=False):
-    ''' Build a dense_block where the output of each conv_block is fed to subsequent ones
-    Args:
-        x: keras tensor
-        nb_layers: the number of layers of conv_block to append to the model.
-        nb_filter: number of filters
-        growth_rate: growth rate
-        bottleneck: bottleneck block
-        dropout_rate: dropout rate
-        weight_decay: weight decay factor
-        grow_nb_filters: flag to decide to allow number of filters to grow
-        return_concat_list: return the list of feature maps along with the actual output
-    Returns: keras tensor with nb_layers of conv_block appended
-    '''
-
-    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
-
-    x_list = [x]
-
-    for i in range(nb_layers):
-        conv_block = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay)
-        x_list.append(conv_block)
-
-        x = concatenate([x, conv_block], axis=concat_axis)
-
-        if grow_nb_filters:
-            nb_filter += growth_rate
-
-    if return_concat_list:
-        return x, nb_filter, x_list
-    else:
-        return x, nb_filter
-
-
-def __transition_up_block(ip, nb_filters, type='upsampling', weight_decay=1E-4):
-    ''' SubpixelConvolutional Upscaling (factor = 2)
-    Args:
-        ip: keras tensor
-        nb_filters: number of layers
-        type: can be 'upsampling', 'subpixel', 'deconv'. Determines type of upsampling performed
-        weight_decay: weight decay factor
-    Returns: keras tensor, after applying upsampling operation.
-    '''
-
-    if type == 'upsampling':
-        x = UpSampling2D()(ip)
-    elif type == 'subpixel':
-        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay),
-                   use_bias=False, kernel_initializer='he_uniform')(ip)
-        x = SubPixelUpscaling(scale_factor=2)(x)
-        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay),
-                   use_bias=False, kernel_initializer='he_uniform')(x)
-    else:
-        x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same', strides=(2, 2),
-                            kernel_initializer='he_uniform')(ip)
-
-    return x
+        if type == 'upsampling':
+            x = UpSampling2D(name=name_or_none(block_prefix, '_upsampling'))(ip)
+        elif type == 'subpixel':
+            x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay),
+                       use_bias=False, kernel_initializer='he_normal', name=name_or_none(block_prefix, '_conv2D'))(ip)
+            x = SubPixelUpscaling(scale_factor=2, name=name_or_none(block_prefix, '_subpixel'))(x)
+            x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay),
+                       use_bias=False, kernel_initializer='he_normal', name=name_or_none(block_prefix, '_conv2D_2'))(x)
+        else:
+            x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same', strides=(2, 2),
+                                kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay),
+                                name=name_or_none(block_prefix, '_conv2DT'))(ip)
+        return x
 
 
 def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=-1,
-                       nb_layers_per_block=-1, bottleneck=False, reduction=0.0, dropout_rate=None, weight_decay=1E-4,
-                       activation='softmax'):
+                       nb_layers_per_block=-1, bottleneck=False, reduction=0.0, dropout_rate=None, weight_decay=1e-4,
+                       subsample_initial_block=False, pooling=None, activation='softmax'):
     ''' Build the DenseNet model
-    Args:
+
+    # Arguments
         nb_classes: number of classes
         img_input: tuple of shape (channels, rows, columns) or (rows, columns, channels)
         include_top: flag to include the final Dense layer
@@ -442,79 +723,120 @@ def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_bl
         bottleneck: add bottleneck blocks
         reduction: reduction factor of transition blocks. Note : reduction value is inverted to compute compression
         dropout_rate: dropout rate
-        weight_decay: weight decay
+        weight_decay: weight decay rate
+        subsample_initial_block: Changes model type to suit different datasets.
+            Should be set to True for ImageNet, and False for CIFAR datasets.
+            When set to True, the initial convolution will be strided and
+            adds a MaxPooling2D before the initial dense block.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
         activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
                 Note that if sigmoid is used, classes must be 1.
-    Returns: keras tensor with nb_layers of conv_block appended
+
+    # Returns
+        a keras tensor
+
+    # Raises
+        ValueError: in case of invalid argument for `reduction`
+            or a `nb_layers_per_block` list not matching `nb_dense_block`
     '''
+    with K.name_scope('DenseNet'):
+        concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
 
-    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+        if reduction != 0.0:
+            if not (reduction <= 1.0 and reduction > 0.0):
+                raise ValueError('`reduction` value must lie between 0.0 and 1.0')
 
-    assert (depth - 4) % 3 == 0, 'Depth must be 3 N + 4'
-    if reduction != 0.0:
-        assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0'
+        # layers in each dense block
+        if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
+            nb_layers = list(nb_layers_per_block)  # Convert tuple to list
 
-    # layers in each dense block
-    if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
-        nb_layers = list(nb_layers_per_block)  # Convert tuple to list
+            if len(nb_layers) != (nb_dense_block):
+                raise ValueError('If `nb_layers_per_block` is a list, its length must match '
+                                 'the number of dense blocks given by `nb_dense_block`.')
 
-        assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. ' \
-                                                       'Note that list size must be (nb_dense_block + 1)'
-        final_nb_layer = nb_layers[-1]
-        nb_layers = nb_layers[:-1]
-    else:
-        if nb_layers_per_block == -1:
-            count = int((depth - 4) / 3)
-            nb_layers = [count for _ in range(nb_dense_block)]
-            final_nb_layer = count
+            final_nb_layer = nb_layers[-1]
+            nb_layers = nb_layers[:-1]
         else:
-            final_nb_layer = nb_layers_per_block
-            nb_layers = [nb_layers_per_block] * nb_dense_block
+            if nb_layers_per_block == -1:
+                assert (depth - 4) % 3 == 0, 'Depth must be 3 N + 4 if nb_layers_per_block == -1'
+                count = (depth - 4) // (6 if bottleneck else 3)  # a bottleneck layer holds two convs
+                nb_layers = [count for _ in range(nb_dense_block)]
+                final_nb_layer = count
+            else:
+                final_nb_layer = nb_layers_per_block
+                nb_layers = [nb_layers_per_block] * nb_dense_block
 
-    if bottleneck:
-        nb_layers = [int(layer // 2) for layer in nb_layers]
+        # compute initial nb_filter if -1, else accept users initial nb_filter
+        if nb_filter <= 0:
+            nb_filter = 2 * growth_rate
 
-    # compute initial nb_filter if -1, else accept users initial nb_filter
-    if nb_filter <= 0:
-        nb_filter = 2 * growth_rate
+        # compute compression factor
+        compression = 1.0 - reduction
 
-    # compute compression factor
-    compression = 1.0 - reduction
+        # Initial convolution
+        if subsample_initial_block:
+            initial_kernel = (7, 7)
+            initial_strides = (2, 2)
+        else:
+            initial_kernel = (3, 3)
+            initial_strides = (1, 1)
 
-    # Initial convolution
-    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D',
-               use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+        x = Conv2D(nb_filter, initial_kernel, kernel_initializer='he_normal', padding='same', name='initial_conv2D',
+                   strides=initial_strides, use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
 
-    # Add dense blocks
-    for block_idx in range(nb_dense_block - 1):
-        x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, bottleneck=bottleneck,
-                                     dropout_rate=dropout_rate, weight_decay=weight_decay)
-        # add transition_block
-        x = __transition_block(x, nb_filter, compression=compression, dropout_rate=dropout_rate,
-                               weight_decay=weight_decay)
-        nb_filter = int(nb_filter * compression)
+        if subsample_initial_block:
+            x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='initial_bn')(x)
+            x = Activation('relu')(x)
+            x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
 
-    # The last dense_block does not have a transition_block
-    x, nb_filter = __dense_block(x, final_nb_layer, nb_filter, growth_rate, bottleneck=bottleneck,
-                                 dropout_rate=dropout_rate, weight_decay=weight_decay)
+        # Add dense blocks
+        for block_idx in range(nb_dense_block - 1):
+            x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, bottleneck=bottleneck,
+                                         dropout_rate=dropout_rate, weight_decay=weight_decay,
+                                         block_prefix='dense_%i' % block_idx)
+            # add transition_block
+            x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay,
+                                   block_prefix='tr_%i' % block_idx)
+            nb_filter = int(nb_filter * compression)
 
-    x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                           beta_regularizer=l2(weight_decay))(x)
-    x = Activation('relu')(x)
-    x = GlobalAveragePooling2D()(x)
+        # The last dense_block does not have a transition_block
+        x, nb_filter = __dense_block(x, final_nb_layer, nb_filter, growth_rate, bottleneck=bottleneck,
+                                     dropout_rate=dropout_rate, weight_decay=weight_decay,
+                                     block_prefix='dense_%i' % (nb_dense_block - 1))
 
-    if include_top:
-        x = Dense(nb_classes, activation=activation, W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay))(x)
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='final_bn')(x)
+        x = Activation('relu')(x)
 
-    return x
+        if include_top:
+            x = GlobalAveragePooling2D()(x)
+            x = Dense(nb_classes, activation=activation)(x)
+        else:
+            if pooling == 'avg':
+                x = GlobalAveragePooling2D()(x)
+            if pooling == 'max':
+                x = GlobalMaxPooling2D()(x)
+
+        return x
 
 
 def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, growth_rate=12,
-                           reduction=0.0, dropout_rate=None, weight_decay=1E-4,
-                           nb_layers_per_block=4, nb_upsampling_conv=128, upsampling_type='deconv',
-                           init_conv_filters=48, input_shape=None, activation='softmax'):
-    ''' Build the DenseNet model
-    Args:
+                           reduction=0.0, dropout_rate=None, weight_decay=1e-4,
+                           nb_layers_per_block=4, nb_upsampling_conv=128, upsampling_type='upsampling',
+                           init_conv_filters=48, input_shape=None, activation='deconv'):
+    ''' Build the DenseNet-FCN model
+
+    # Arguments
         nb_classes: number of classes
         img_input: tuple of shape (channels, rows, columns) or (rows, columns, channels)
         include_top: flag to include the final Dense layer
@@ -534,104 +856,116 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
         input_shape: Only used for shape inference in fully convolutional networks.
         activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
                     Note that if sigmoid is used, classes must be 1.
-    Returns: keras tensor with nb_layers of conv_block appended
+
+    # Returns
+        a keras tensor
+
+    # Raises
+        ValueError: in case of invalid argument for `reduction`,
+            `nb_dense_block` or `nb_upsampling_conv`.
     '''
+    with K.name_scope('DenseNetFCN'):
+        concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
 
-    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
-
-    if concat_axis == 1:  # channels_first dim ordering
-        _, rows, cols = input_shape
-    else:
-        rows, cols, _ = input_shape
-
-    if reduction != 0.0:
-        assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0'
-
-    # check if upsampling_conv has minimum number of filters
-    # minimum is set to 12, as at least 3 color channels are needed for correct upsampling
-    assert nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0, 'Parameter `upsampling_conv` number of channels must ' \
-                                                                    'be a positive number divisible by 4 and greater ' \
-                                                                    'than 12'
-
-    # layers in each dense block
-    if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
-        nb_layers = list(nb_layers_per_block)  # Convert tuple to list
-
-        assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. ' \
-                                                       'Note that list size must be (nb_dense_block + 1)'
-
-        bottleneck_nb_layers = nb_layers[-1]
-        rev_layers = nb_layers[::-1]
-        nb_layers.extend(rev_layers[1:])
-    else:
-        bottleneck_nb_layers = nb_layers_per_block
-        nb_layers = [nb_layers_per_block] * (2 * nb_dense_block + 1)
-
-    # compute compression factor
-    compression = 1.0 - reduction
-
-    # Initial convolution
-    x = Conv2D(init_conv_filters, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D',
-               use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
-
-    nb_filter = init_conv_filters
-
-    skip_list = []
-
-    # Add dense blocks and transition down block
-    for block_idx in range(nb_dense_block):
-        x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate,
-                                     dropout_rate=dropout_rate, weight_decay=weight_decay)
-
-        # Skip connection
-        skip_list.append(x)
-
-        # add transition_block
-        x = __transition_block(x, nb_filter, compression=compression, dropout_rate=dropout_rate,
-                               weight_decay=weight_decay)
-
-        nb_filter = int(nb_filter * compression)  # this is calculated inside transition_down_block
-
-    # The last dense_block does not have a transition_down_block
-    # return the concatenated feature maps without the concatenation of the input
-    _, nb_filter, concat_list = __dense_block(x, bottleneck_nb_layers, nb_filter, growth_rate,
-                                              dropout_rate=dropout_rate, weight_decay=weight_decay,
-                                              return_concat_list=True)
-
-    skip_list = skip_list[::-1]  # reverse the skip list
-
-    # Add dense blocks and transition up block
-    for block_idx in range(nb_dense_block):
-        n_filters_keep = growth_rate * nb_layers[nb_dense_block + block_idx]
-
-        # upsampling block must upsample only the feature maps (concat_list[1:]),
-        # not the concatenation of the input with the feature maps (concat_list[0].
-        l = concatenate(concat_list[1:], axis=concat_axis)
-
-        t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type)
-
-        # concatenate the skip connection with the transition block
-        x = concatenate([t, skip_list[block_idx]], axis=concat_axis)
-
-        # Dont allow the feature map size to grow in upsampling dense blocks
-        x_up, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1], nb_filter=growth_rate,
-                                                     growth_rate=growth_rate, dropout_rate=dropout_rate,
-                                                     weight_decay=weight_decay,
-                                                     return_concat_list=True, grow_nb_filters=False)
-
-    if include_top:
-        x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', kernel_regularizer=l2(weight_decay),
-                   use_bias=False)(x_up)
-
-        if K.image_data_format() == 'channels_first':
-            channel, row, col = input_shape
+        if concat_axis == 1:  # channels_first dim ordering
+            _, rows, cols = input_shape
         else:
-            row, col, channel = input_shape
+            rows, cols, _ = input_shape
 
-        x = Reshape((row * col, nb_classes))(x)
-        x = Activation(activation)(x)
-        x = Reshape((row, col, nb_classes))(x)
-    else:
-        x = x_up
+        if reduction != 0.0:
+            if not (reduction <= 1.0 and reduction > 0.0):
+                raise ValueError('`reduction` value must lie between 0.0 and 1.0')
 
-    return x
+        # check if upsampling_conv has minimum number of filters
+        # minimum is set to 12, as at least 3 color channels are needed for correct upsampling
+        if not (nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0):
+            raise ValueError('Parameter `nb_upsampling_conv` number of channels must '
+                             'be a positive number divisible by 4 and greater than 12')
+
+        # layers in each dense block
+        if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
+            nb_layers = list(nb_layers_per_block)  # Convert tuple to list
+
+            if len(nb_layers) != (nb_dense_block + 1):
+                raise ValueError('If `nb_dense_block` is a list, its length must be '
+                                 '(`nb_dense_block` + 1)')
+
+            bottleneck_nb_layers = nb_layers[-1]
+            rev_layers = nb_layers[::-1]
+            nb_layers.extend(rev_layers[1:])
+        else:
+            bottleneck_nb_layers = nb_layers_per_block
+            nb_layers = [nb_layers_per_block] * (2 * nb_dense_block + 1)
+
+        # compute compression factor
+        compression = 1.0 - reduction
+
+        # Initial convolution
+        x = Conv2D(init_conv_filters, (7, 7), kernel_initializer='he_normal', padding='same', name='initial_conv2D',
+                   use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='initial_bn')(x)
+        x = Activation('relu')(x)
+
+        nb_filter = init_conv_filters
+
+        skip_list = []
+
+        # Add dense blocks and transition down block
+        for block_idx in range(nb_dense_block):
+            x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, dropout_rate=dropout_rate,
+                                         weight_decay=weight_decay, block_prefix='dense_%i' % block_idx)
+
+            # Skip connection
+            skip_list.append(x)
+
+            # add transition_block
+            x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay,
+                                   block_prefix='tr_%i' % block_idx)
+
+            nb_filter = int(nb_filter * compression)  # this is calculated inside transition_down_block
+
+        # The last dense_block does not have a transition_down_block
+        # return the concatenated feature maps without the concatenation of the input
+        _, nb_filter, concat_list = __dense_block(x, bottleneck_nb_layers, nb_filter, growth_rate,
+                                                  dropout_rate=dropout_rate, weight_decay=weight_decay,
+                                                  return_concat_list=True,
+                                                  block_prefix='dense_%i' % nb_dense_block)
+
+        skip_list = skip_list[::-1]  # reverse the skip list
+
+        # Add dense blocks and transition up block
+        for block_idx in range(nb_dense_block):
+            n_filters_keep = growth_rate * nb_layers[nb_dense_block + block_idx]
+
+            # upsampling block must upsample only the feature maps (concat_list[1:]),
+            # not the concatenation of the input with the feature maps (concat_list[0]).
+            l = concatenate(concat_list[1:], axis=concat_axis)
+
+            t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type, weight_decay=weight_decay,
+                                      block_prefix='tr_up_%i' % block_idx)
+
+            # concatenate the skip connection with the transition block
+            x = concatenate([t, skip_list[block_idx]], axis=concat_axis)
+
+            # Don't allow the feature map size to grow in upsampling dense blocks
+            x_up, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1],
+                                                         nb_filter=growth_rate, growth_rate=growth_rate,
+                                                         dropout_rate=dropout_rate, weight_decay=weight_decay,
+                                                         return_concat_list=True, grow_nb_filters=False,
+                                                         block_prefix='dense_%i' % (nb_dense_block + 1 + block_idx))
+
+        if include_top:
+            x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', use_bias=False)(x_up)
+
+            if K.image_data_format() == 'channels_first':
+                channel, row, col = input_shape
+            else:
+                row, col, channel = input_shape
+
+            x = Reshape((row * col, nb_classes))(x)
+            x = Activation(activation)(x)
+            x = Reshape((row, col, nb_classes))(x)
+        else:
+            x = x_up
+
+        return x
diff --git a/keras_contrib/applications/nasnet.py b/keras_contrib/applications/nasnet.py
new file mode 100644
index 0000000..89ebb98
--- /dev/null
+++ b/keras_contrib/applications/nasnet.py
@@ -0,0 +1,773 @@
+"""Collection of NASNet models
+
+The reference paper:
+ - [Learning Transferable Architectures for Scalable Image Recognition]
+    (https://arxiv.org/abs/1707.07012)
+
+The reference implementation:
+1. TF Slim
+ - https://github.com/tensorflow/models/blob/master/research/slim/nets/
+   nasnet/nasnet.py
+2. TensorNets
+ - https://github.com/taehoonlee/tensornets/blob/master/tensornets/nasnets.py
+3. Weights
+ - https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet
+"""
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import division
+
+import warnings
+
+from keras.models import Model
+from keras.layers import Input
+from keras.layers import Activation
+from keras.layers import Dense
+from keras.layers import Dropout
+from keras.layers import BatchNormalization
+from keras.layers import MaxPooling2D
+from keras.layers import AveragePooling2D
+from keras.layers import GlobalAveragePooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.layers import Conv2D
+from keras.layers import SeparableConv2D
+from keras.layers import ZeroPadding2D
+from keras.layers import Cropping2D
+from keras.layers import concatenate
+from keras.layers import add
+from keras.regularizers import l2
+from keras.utils.data_utils import get_file
+from keras.engine.topology import get_source_inputs
+from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.applications.inception_v3 import preprocess_input
+from keras.applications.imagenet_utils import decode_predictions
+from keras import backend as K
+
+_BN_DECAY = 0.9997
+_BN_EPSILON = 1e-3
+
+NASNET_MOBILE_WEIGHT_PATH = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-mobile.h5"
+NASNET_MOBILE_WEIGHT_PATH_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-mobile-no-top.h5"
+NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-auxiliary-mobile.h5"
+NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-auxiliary-mobile-no-top.h5"
+NASNET_LARGE_WEIGHT_PATH = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-large.h5"
+NASNET_LARGE_WEIGHT_PATH_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-large-no-top.h5"
+NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-auxiliary-large.h5"
+NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-auxiliary-large-no-top.h5"
+
+
+def NASNet(input_shape=None,
+           penultimate_filters=4032,
+           nb_blocks=6,
+           stem_filters=96,
+           skip_reduction=True,
+           use_auxiliary_branch=False,
+           filters_multiplier=2,
+           dropout=0.5,
+           weight_decay=5e-5,
+           include_top=True,
+           weights=None,
+           input_tensor=None,
+           pooling=None,
+           classes=1000,
+           default_size=None):
+    """Instantiates a NASNet architecture.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(331, 331, 3)` for NASNetLarge or
+            `(224, 224, 3)` for NASNetMobile).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        penultimate_filters: number of filters in the penultimate layer.
+            NASNet models use the notation `NASNet (N @ P)`, where:
+                -   N is the number of blocks
+                -   P is the number of penultimate filters
+        nb_blocks: number of repeated blocks of the NASNet model.
+            NASNet models use the notation `NASNet (N @ P)`, where:
+                -   N is the number of blocks
+                -   P is the number of penultimate filters
+        stem_filters: number of filters in the initial stem block
+        skip_reduction: Whether to skip the stem reduction cells at the start
+            of the network. Set to `True` for CIFAR models.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        filters_multiplier: controls the width of the network.
+            - If `filters_multiplier` < 1.0, proportionally decreases the number
+                of filters in each layer.
+            - If `filters_multiplier` > 1.0, proportionally increases the number
+                of filters in each layer.
+            - If `filters_multiplier` = 1, default number of filters from the paper
+                 are used at each layer.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        default_size: specifies the default image size of the model
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    if K.backend() != 'tensorflow':
+        raise RuntimeError('Only Tensorflow backend is currently supported, '
+                           'as other backends do not support '
+                           'separable convolution.')
+
+    if weights not in {'imagenet', None}:
+        raise ValueError('The `weights` argument should be either '
+                         '`None` (random initialization) or `imagenet` '
+                         '(pre-training on ImageNet).')
+
+    if weights == 'imagenet' and include_top and classes != 1000:
+        raise ValueError('If using `weights` as ImageNet with `include_top` '
+                         'as true, `classes` should be 1000')
+
+    if default_size is None:
+        default_size = 331
+
+    # Determine proper input shape and default size.
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=default_size,
+                                      min_size=32,
+                                      data_format=K.image_data_format(),
+                                      require_flatten=include_top or weights)
+
+    if K.image_data_format() != 'channels_last':
+        warnings.warn('The NASNet family of models is only available '
+                      'for the input data format "channels_last" '
+                      '(width, height, channels). '
+                      'However your settings specify the default '
+                      'data format "channels_first" (channels, width, height).'
+                      ' You should set `image_data_format="channels_last"` '
+                      'in your Keras config located at ~/.keras/keras.json. '
+                      'The model being returned right now will expect inputs '
+                      'to follow the "channels_last" data format.')
+        K.set_image_data_format('channels_last')
+        old_data_format = 'channels_first'
+    else:
+        old_data_format = None
+
+    if input_tensor is None:
+        img_input = Input(shape=input_shape)
+    else:
+        if not K.is_keras_tensor(input_tensor):
+            img_input = Input(tensor=input_tensor, shape=input_shape)
+        else:
+            img_input = input_tensor
+
+    assert penultimate_filters % 24 == 0, "`penultimate_filters` needs to be divisible " \
+                                          "by 24."
+
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+    filters = penultimate_filters // 24
+
+    if not skip_reduction:
+        x = Conv2D(stem_filters, (3, 3), strides=(2, 2), padding='valid', use_bias=False, name='stem_conv1',
+                   kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(img_input)
+    else:
+        x = Conv2D(stem_filters, (3, 3), strides=(1, 1), padding='same', use_bias=False, name='stem_conv1',
+                   kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(img_input)
+
+    x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                           name='stem_bn1')(x)
+
+    p = None
+    if not skip_reduction:  # imagenet / mobile mode
+        x, p = _reduction_A(x, p, filters // (filters_multiplier ** 2), weight_decay, id='stem_1')
+        x, p = _reduction_A(x, p, filters // filters_multiplier, weight_decay, id='stem_2')
+
+    for i in range(nb_blocks):
+        x, p = _normal_A(x, p, filters, weight_decay, id='%d' % (i))
+
+    x, p0 = _reduction_A(x, p, filters * filters_multiplier, weight_decay, id='reduce_%d' % (nb_blocks))
+
+    p = p0 if not skip_reduction else p
+
+    for i in range(nb_blocks):
+        x, p = _normal_A(x, p, filters * filters_multiplier, weight_decay, id='%d' % (nb_blocks + i + 1))
+
+    auxiliary_x = None
+    if not skip_reduction:  # imagenet / mobile mode
+        if use_auxiliary_branch:
+            auxiliary_x = _add_auxiliary_head(x, classes, weight_decay)
+
+    x, p0 = _reduction_A(x, p, filters * filters_multiplier ** 2, weight_decay, id='reduce_%d' % (2 * nb_blocks))
+
+    if skip_reduction:  # CIFAR mode
+        if use_auxiliary_branch:
+            auxiliary_x = _add_auxiliary_head(x, classes, weight_decay)
+
+    p = p0 if not skip_reduction else p
+
+    for i in range(nb_blocks):
+        x, p = _normal_A(x, p, filters * filters_multiplier ** 2, weight_decay, id='%d' % (2 * nb_blocks + i + 1))
+
+    x = Activation('relu')(x)
+
+    if include_top:
+        x = GlobalAveragePooling2D()(x)
+        x = Dropout(dropout)(x)
+        x = Dense(classes, activation='softmax', kernel_regularizer=l2(weight_decay), name='predictions')(x)
+    else:
+        if pooling == 'avg':
+            x = GlobalAveragePooling2D()(x)
+        elif pooling == 'max':
+            x = GlobalMaxPooling2D()(x)
+
+    # Ensure that the model takes into account
+    # any potential predecessors of `input_tensor`.
+    if input_tensor is not None:
+        inputs = get_source_inputs(input_tensor)
+    else:
+        inputs = img_input
+
+    # Create model.
+    if use_auxiliary_branch:
+        model = Model(inputs, [x, auxiliary_x], name='NASNet_with_auxiliary')
+    else:
+        model = Model(inputs, x, name='NASNet')
+
+    # load weights
+    if weights == 'imagenet':
+        if default_size == 224:  # mobile version
+            if include_top:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY
+                    model_name = 'nasnet_mobile_with_aux.h5'
+                else:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH
+                    model_name = 'nasnet_mobile.h5'
+            else:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP
+                    model_name = 'nasnet_mobile_with_aux_no_top.h5'
+                else:
+                    weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP
+                    model_name = 'nasnet_mobile_no_top.h5'
+
+            weights_file = get_file(model_name, weight_path, cache_subdir='models')
+            model.load_weights(weights_file, by_name=True)
+
+        elif default_size == 331:  # large version
+            if include_top:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary
+                    model_name = 'nasnet_large_with_aux.h5'
+                else:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH
+                    model_name = 'nasnet_large.h5'
+            else:
+                if use_auxiliary_branch:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP
+                    model_name = 'nasnet_large_with_aux_no_top.h5'
+                else:
+                    weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP
+                    model_name = 'nasnet_large_no_top.h5'
+
+            weights_file = get_file(model_name, weight_path, cache_subdir='models')
+            model.load_weights(weights_file, by_name=True)
+
+        else:
+            raise ValueError('ImageNet weights can only be loaded on NASNetLarge or NASNetMobile')
+
+    if old_data_format:
+        K.set_image_data_format(old_data_format)
+
+    return model
+
+
+def NASNetLarge(input_shape=(331, 331, 3),
+                dropout=0.5,
+                weight_decay=5e-5,
+                use_auxiliary_branch=False,
+                include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                pooling=None,
+                classes=1000):
+    """Instantiates a NASNet architecture in ImageNet mode.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(331, 331, 3)` for NASNetLarge).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        default_size: specifies the default image size of the model
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    global _BN_DECAY, _BN_EPSILON
+    _BN_DECAY = 0.9997
+    _BN_EPSILON = 1e-3
+
+    return NASNet(input_shape,
+                  penultimate_filters=4032,
+                  nb_blocks=6,
+                  stem_filters=96,
+                  skip_reduction=False,
+                  use_auxiliary_branch=use_auxiliary_branch,
+                  filters_multiplier=2,
+                  dropout=dropout,
+                  weight_decay=weight_decay,
+                  include_top=include_top,
+                  weights=weights,
+                  input_tensor=input_tensor,
+                  pooling=pooling,
+                  classes=classes,
+                  default_size=331)
+
+
+def NASNetMobile(input_shape=(224, 224, 3),
+                 dropout=0.5,
+                 weight_decay=4e-5,
+                 use_auxiliary_branch=False,
+                 include_top=True,
+                 weights='imagenet',
+                 input_tensor=None,
+                 pooling=None,
+                 classes=1000):
+    """Instantiates a NASNet architecture in Mobile ImageNet mode.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` for NASNetMobile).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 32.
+            E.g. `(224, 224, 3)` would be one valid value.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        default_size: specifies the default image size of the model
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    global _BN_DECAY, _BN_EPSILON
+    _BN_DECAY = 0.9997
+    _BN_EPSILON = 1e-3
+
+    return NASNet(input_shape,
+                  penultimate_filters=1056,
+                  nb_blocks=4,
+                  stem_filters=32,
+                  skip_reduction=False,
+                  use_auxiliary_branch=use_auxiliary_branch,
+                  filters_multiplier=2,
+                  dropout=dropout,
+                  weight_decay=weight_decay,
+                  include_top=include_top,
+                  weights=weights,
+                  input_tensor=input_tensor,
+                  pooling=pooling,
+                  classes=classes,
+                  default_size=224)
+
+
+def NASNetCIFAR(input_shape=(32, 32, 3),
+                dropout=0.0,
+                weight_decay=5e-4,
+                use_auxiliary_branch=False,
+                include_top=True,
+                weights=None,
+                input_tensor=None,
+                pooling=None,
+                classes=10):
+    """Instantiates a NASNet architecture in CIFAR mode.
+    Note that only TensorFlow is supported for now,
+    therefore it only works with the data format
+    `image_data_format='channels_last'` in your Keras config
+    at `~/.keras/keras.json`.
+
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(32, 32, 3)` for NASNetCIFAR).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 32.
+            E.g. `(32, 32, 3)` would be one valid value.
+        use_auxiliary_branch: Whether to use the auxiliary branch during
+            training or evaluation.
+        dropout: dropout rate
+        weight_decay: l2 regularization weight
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: `None` (random initialization) or
+            `imagenet` (ImageNet weights)
+        input_tensor: optional Keras tensor (i.e. output of
+            `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+
+    # Returns
+        A Keras model instance.
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+        RuntimeError: If attempting to run this model with a
+            backend that does not support separable convolutions.
+    """
+    global _BN_DECAY, _BN_EPSILON
+    _BN_DECAY = 0.9
+    _BN_EPSILON = 1e-5
+
+    return NASNet(input_shape,
+                  penultimate_filters=768,
+                  nb_blocks=6,
+                  stem_filters=32,
+                  skip_reduction=True,
+                  use_auxiliary_branch=use_auxiliary_branch,
+                  filters_multiplier=2,
+                  dropout=dropout,
+                  weight_decay=weight_decay,
+                  include_top=include_top,
+                  weights=weights,
+                  input_tensor=input_tensor,
+                  pooling=pooling,
+                  classes=classes,
+                  default_size=224)
+
+
+def _separable_conv_block(ip, filters, kernel_size=(3, 3), strides=(1, 1), weight_decay=5e-5, id=None):
+    '''Adds 2 blocks of [relu-separable conv-batchnorm]
+
+    # Arguments:
+        ip: input tensor
+        filters: number of output filters per layer
+        kernel_size: kernel size of separable convolutions
+        strides: strides of the first separable convolution only
+        weight_decay: l2 regularization weight
+        id: string suffix used in the name scope and the layer names
+
+    # Returns:
+        a Keras tensor
+    '''
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+    with K.name_scope('separable_conv_block_%s' % id):
+        x = Activation('relu')(ip)
+        x = SeparableConv2D(filters, kernel_size, strides=strides, name='separable_conv_1_%s' % id,
+                            padding='same', use_bias=False, kernel_initializer='he_normal',
+                            kernel_regularizer=l2(weight_decay))(x)
+        x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                               name="separable_conv_1_bn_%s" % (id))(x)
+        x = Activation('relu')(x)
+        x = SeparableConv2D(filters, kernel_size, name='separable_conv_2_%s' % id,
+                            padding='same', use_bias=False, kernel_initializer='he_normal',
+                            kernel_regularizer=l2(weight_decay))(x)
+        x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                               name="separable_conv_2_bn_%s" % (id))(x)
+    return x
+
+
+def _adjust_block(p, ip, filters, weight_decay=5e-5, id=None):
+    '''
+    Adjusts the input `p` to match the shape of the input `ip`
+    or situations where the output number of filters needs to
+    be changed
+
+    # Arguments:
+        p: input tensor which needs to be modified (`ip` is used if None)
+        ip: input tensor whose shape needs to be matched
+        filters: number of output filters to be matched
+        weight_decay: l2 regularization weight
+        id: string suffix used in the name scopes and the layer names
+
+    # Returns:
+        an adjusted Keras tensor
+    '''
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+    img_dim = 2 if K.image_data_format() == 'channels_first' else -2
+
+    with K.name_scope('adjust_block'):
+        if p is None:
+            p = ip
+        # Shapes are read through the public `K.int_shape` rather than `_keras_shape`.
+        elif K.int_shape(p)[img_dim] != K.int_shape(ip)[img_dim]:
+            with K.name_scope('adjust_reduction_block_%s' % id):
+                p = Activation('relu', name='adjust_relu_1_%s' % id)(p)
+
+                p1 = AveragePooling2D((1, 1), strides=(2, 2), padding='valid', name='adjust_avg_pool_1_%s' % id)(p)
+                p1 = Conv2D(filters // 2, (1, 1), padding='same', use_bias=False, kernel_regularizer=l2(weight_decay),
+                            name='adjust_conv_1_%s' % id, kernel_initializer='he_normal')(p1)
+                # Second path: pad bottom/right by one and crop top/left by one before subsampling.
+                p2 = ZeroPadding2D(padding=((0, 1), (0, 1)))(p)
+                p2 = Cropping2D(cropping=((1, 0), (1, 0)))(p2)
+                p2 = AveragePooling2D((1, 1), strides=(2, 2), padding='valid', name='adjust_avg_pool_2_%s' % id)(p2)
+                p2 = Conv2D(filters // 2, (1, 1), padding='same', use_bias=False, kernel_regularizer=l2(weight_decay),
+                            name='adjust_conv_2_%s' % id, kernel_initializer='he_normal')(p2)
+
+                p = concatenate([p1, p2], axis=channel_dim)
+                p = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                                       name='adjust_bn_%s' % id)(p)
+        # Same spatial size but a different channel count: project with a 1x1 convolution.
+        elif K.int_shape(p)[channel_dim] != filters:
+            with K.name_scope('adjust_projection_block_%s' % id):
+                p = Activation('relu')(p)
+                p = Conv2D(filters, (1, 1), strides=(1, 1), padding='same', name='adjust_conv_projection_%s' % id,
+                           use_bias=False, kernel_regularizer=l2(weight_decay), kernel_initializer='he_normal')(p)
+                p = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                                       name='adjust_bn_%s' % id)(p)
+    return p
+
+
+def _normal_A(ip, p, filters, weight_decay=5e-5, id=None):
+    '''Adds a Normal cell for NASNet-A (Fig. 4 in the paper)
+
+    # Arguments:
+        ip: input tensor `x`
+        p: input tensor `p`
+        filters: number of output filters
+        weight_decay: l2 regularization weight
+        id: string id
+
+    # Returns:
+        a tuple `(x, ip)`: the concatenated cell output and the unchanged input tensor `ip`
+    '''
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+    with K.name_scope('normal_A_block_%s' % id):
+        p = _adjust_block(p, ip, filters, weight_decay, id)
+
+        h = Activation('relu')(ip)
+        h = Conv2D(filters, (1, 1), strides=(1, 1), padding='same', name='normal_conv_1_%s' % id,
+                   use_bias=False, kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(h)
+        h = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                               name='normal_bn_1_%s' % id)(h)
+
+        with K.name_scope('block_1'):
+            x1_1 = _separable_conv_block(h, filters, kernel_size=(5, 5), weight_decay=weight_decay,
+                                         id='normal_left1_%s' % id)
+            x1_2 = _separable_conv_block(p, filters, weight_decay=weight_decay, id='normal_right1_%s' % id)
+            x1 = add([x1_1, x1_2], name='normal_add_1_%s' % id)
+
+        with K.name_scope('block_2'):
+            x2_1 = _separable_conv_block(p, filters, (5, 5), weight_decay=weight_decay, id='normal_left2_%s' % id)
+            x2_2 = _separable_conv_block(p, filters, (3, 3), weight_decay=weight_decay, id='normal_right2_%s' % id)
+            x2 = add([x2_1, x2_2], name='normal_add_2_%s' % id)
+
+        with K.name_scope('block_3'):
+            x3 = AveragePooling2D((3, 3), strides=(1, 1), padding='same', name='normal_left3_%s' % (id))(h)
+            x3 = add([x3, p], name='normal_add_3_%s' % id)
+
+        with K.name_scope('block_4'):
+            x4_1 = AveragePooling2D((3, 3), strides=(1, 1), padding='same', name='normal_left4_%s' % (id))(p)
+            x4_2 = AveragePooling2D((3, 3), strides=(1, 1), padding='same', name='normal_right4_%s' % (id))(p)
+            x4 = add([x4_1, x4_2], name='normal_add_4_%s' % id)
+
+        with K.name_scope('block_5'):
+            x5 = _separable_conv_block(h, filters, weight_decay=weight_decay, id='normal_left5_%s' % id)
+            x5 = add([x5, h], name='normal_add_5_%s' % id)
+
+        x = concatenate([p, x1, x2, x3, x4, x5], axis=channel_dim, name='normal_concat_%s' % id)
+    return x, ip
+
+
+def _reduction_A(ip, p, filters, weight_decay=5e-5, id=None):
+    '''Adds a Reduction cell for NASNet-A (Fig. 4 in the paper)
+
+    # Arguments:
+        ip: input tensor `x`
+        p: input tensor `p`
+        filters: number of output filters
+        weight_decay: l2 regularization weight
+        id: string id
+
+    # Returns:
+        a tuple `(x, ip)`: the concatenated cell output and the
+        unchanged input tensor `ip`
+    '''
+    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+    with K.name_scope('reduction_A_block_%s' % id):
+        p = _adjust_block(p, ip, filters, weight_decay, id)
+
+        h = Activation('relu')(ip)
+        h = Conv2D(filters, (1, 1), strides=(1, 1), padding='same', name='reduction_conv_1_%s' % id,
+                   use_bias=False, kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(h)
+        h = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                               name='reduction_bn_1_%s' % id)(h)
+
+        with K.name_scope('block_1'):
+            x1_1 = _separable_conv_block(h, filters, (5, 5), strides=(2, 2), weight_decay=weight_decay,
+                                         id='reduction_left1_%s' % id)
+            x1_2 = _separable_conv_block(p, filters, (7, 7), strides=(2, 2), weight_decay=weight_decay,
+                                         id='reduction_1_%s' % id)
+            x1 = add([x1_1, x1_2], name='reduction_add_1_%s' % id)
+
+        with K.name_scope('block_2'):
+            x2_1 = MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='reduction_left2_%s' % id)(h)
+            x2_2 = _separable_conv_block(p, filters, (7, 7), strides=(2, 2), weight_decay=weight_decay,
+                                         id='reduction_right2_%s' % id)
+            x2 = add([x2_1, x2_2], name='reduction_add_2_%s' % id)
+
+        with K.name_scope('block_3'):
+            x3_1 = AveragePooling2D((3, 3), strides=(2, 2), padding='same', name='reduction_left3_%s' % id)(h)
+            x3_2 = _separable_conv_block(p, filters, (5, 5), strides=(2, 2), weight_decay=weight_decay,
+                                         id='reduction_right3_%s' % id)
+            x3 = add([x3_1, x3_2], name='reduction_add3_%s' % id)
+
+        with K.name_scope('block_4'):
+            x4 = AveragePooling2D((3, 3), strides=(1, 1), padding='same', name='reduction_left4_%s' % id)(x1)
+            x4 = add([x2, x4])
+
+        with K.name_scope('block_5'):
+            x5_1 = _separable_conv_block(x1, filters, (3, 3), weight_decay=weight_decay, id='reduction_left4_%s' % id)
+            x5_2 = MaxPooling2D((3, 3), strides=(2, 2), padding='same', name='reduction_right5_%s' % id)(h)
+            x5 = add([x5_1, x5_2], name='reduction_add4_%s' % id)
+
+        x = concatenate([x2, x3, x4, x5], axis=channel_dim, name='reduction_concat_%s' % id)
+        return x, ip
+
+
+def _add_auxiliary_head(x, classes, weight_decay):
+    '''Adds an auxiliary head for training the model
+
+    From section A.7 "Training of ImageNet models" of the paper, all NASNet models are
+    trained using an auxiliary classifier around 2/3 of the depth of the network, with
+    a loss weight of 0.4
+
+    # Arguments
+        x: input tensor
+        classes: number of output classes
+        weight_decay: l2 regularization weight
+
+    # Returns
+        a keras Tensor holding the softmax output of the auxiliary classifier
+    '''
+    img_height = 1 if K.image_data_format() == 'channels_last' else 2
+    img_width = 2 if K.image_data_format() == 'channels_last' else 3
+    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+    with K.name_scope('auxiliary_branch'):
+        auxiliary_x = Activation('relu')(x)
+        auxiliary_x = AveragePooling2D((5, 5), strides=(3, 3), padding='valid', name='aux_pool')(auxiliary_x)
+        auxiliary_x = Conv2D(128, (1, 1), padding='same', use_bias=False, name='aux_conv_projection',
+                             kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(auxiliary_x)
+        auxiliary_x = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                                         name='aux_bn_projection')(auxiliary_x)
+        auxiliary_x = Activation('relu')(auxiliary_x)
+        # The kernel spans the whole remaining feature map; the shape is read via the public `K.int_shape`.
+        auxiliary_x = Conv2D(768, (K.int_shape(auxiliary_x)[img_height], K.int_shape(auxiliary_x)[img_width]),
+                             padding='valid', use_bias=False, kernel_initializer='he_normal',
+                             kernel_regularizer=l2(weight_decay), name='aux_conv_reduction')(auxiliary_x)
+        auxiliary_x = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
+                                         name='aux_bn_reduction')(auxiliary_x)
+        auxiliary_x = Activation('relu')(auxiliary_x)
+
+        auxiliary_x = GlobalAveragePooling2D()(auxiliary_x)
+        auxiliary_x = Dense(classes, activation='softmax', kernel_regularizer=l2(weight_decay),
+                            name='aux_predictions')(auxiliary_x)
+    return auxiliary_x
diff --git a/keras_contrib/applications/resnet.py b/keras_contrib/applications/resnet.py
new file mode 100644
index 0000000..743922c
--- /dev/null
+++ b/keras_contrib/applications/resnet.py
@@ -0,0 +1,454 @@
+"""ResNet v1, v2, and segmentation models for Keras.
+
+# Reference
+
+- [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)
+- [Identity Mappings in Deep Residual Networks](https://arxiv.org/abs/1603.05027)
+
+Reference material for extended functionality:
+
+- [ResNeXt](https://arxiv.org/abs/1611.05431) for Tiny ImageNet support.
+- [Dilated Residual Networks](https://arxiv.org/pdf/1705.09914) for segmentation support.
+- [Deep Residual Learning for Instrument Segmentation in Robotic Surgery](https://arxiv.org/abs/1703.08580)
+  for segmentation support.
+
+Implementation Adapted from: github.com/raghakot/keras-resnet
+"""
+from __future__ import division
+
+import six
+from keras.models import Model
+from keras.layers import Input
+from keras.layers import Activation
+from keras.layers import Reshape
+from keras.layers import Dense
+from keras.layers import Flatten
+from keras.layers import Conv2D
+from keras.layers import MaxPooling2D
+from keras.layers import AveragePooling2D
+from keras.layers.pooling import GlobalAveragePooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.layers import GlobalAveragePooling2D
+from keras.layers import Dropout
+from keras.layers.merge import add
+from keras.layers.normalization import BatchNormalization
+from keras.regularizers import l2
+from keras import backend as K
+from keras.applications.imagenet_utils import _obtain_input_shape
+
+
+def _bn_relu(x, bn_name=None, relu_name=None):
+    """Apply batch normalization along CHANNEL_AXIS, then a relu activation, to `x`."""
+    normalized = BatchNormalization(axis=CHANNEL_AXIS, name=bn_name)(x)
+    relu_layer = Activation("relu", name=relu_name)
+    return relu_layer(normalized)
+
+
+def _conv_bn_relu(**conv_params):
+    """Return a function applying conv -> BN -> relu, configured from keyword options.
+       This is the original ResNet v1 scheme in https://arxiv.org/abs/1512.03385
+    """
+    num_filters = conv_params["filters"]
+    kernel_shape = conv_params["kernel_size"]
+    conv_strides = conv_params.get("strides", (1, 1))
+    dilation = conv_params.get("dilation_rate", (1, 1))
+    name_of_conv = conv_params.get("conv_name", None)
+    name_of_bn = conv_params.get("bn_name", None)
+    name_of_relu = conv_params.get("relu_name", None)
+    initializer = conv_params.get("kernel_initializer", "he_normal")
+    pad_mode = conv_params.get("padding", "same")
+    regularizer = conv_params.get("kernel_regularizer", l2(1.e-4))
+
+    def f(x):
+        conv_layer = Conv2D(filters=num_filters, kernel_size=kernel_shape,
+                            strides=conv_strides, padding=pad_mode,
+                            dilation_rate=dilation,
+                            kernel_initializer=initializer,
+                            kernel_regularizer=regularizer,
+                            name=name_of_conv)
+        return _bn_relu(conv_layer(x), bn_name=name_of_bn, relu_name=name_of_relu)
+
+    return f
+
+
+def _bn_relu_conv(**conv_params):
+    """Return a function applying BN -> relu -> conv (full pre-activation residual unit).
+    This is the ResNet v2 scheme proposed in http://arxiv.org/pdf/1603.05027v2.pdf
+    """
+    num_filters = conv_params["filters"]
+    kernel_shape = conv_params["kernel_size"]
+    conv_strides = conv_params.get("strides", (1, 1))
+    dilation = conv_params.get("dilation_rate", (1, 1))
+    name_of_conv = conv_params.get("conv_name", None)
+    name_of_bn = conv_params.get("bn_name", None)
+    name_of_relu = conv_params.get("relu_name", None)
+    initializer = conv_params.get("kernel_initializer", "he_normal")
+    pad_mode = conv_params.get("padding", "same")
+    regularizer = conv_params.get("kernel_regularizer", l2(1.e-4))
+
+    def f(x):
+        pre_activated = _bn_relu(x, bn_name=name_of_bn, relu_name=name_of_relu)
+        conv_layer = Conv2D(filters=num_filters, kernel_size=kernel_shape,
+                            strides=conv_strides, padding=pad_mode,
+                            dilation_rate=dilation,
+                            kernel_initializer=initializer,
+                            kernel_regularizer=regularizer, name=name_of_conv)
+        return conv_layer(pre_activated)
+
+    return f
+
+
+def _shortcut(input_feature, residual, conv_name_base=None, bn_name_base=None):
+    """Adds a shortcut between input and residual block and merges them with "sum"
+    """
+    # Expand channels of shortcut to match residual.
+    # Stride appropriately to match residual (rows, cols).
+    # Should be int if network architecture is correctly configured.
+    input_shape = K.int_shape(input_feature)
+    residual_shape = K.int_shape(residual)
+    stride_rows = int(round(input_shape[ROW_AXIS] / residual_shape[ROW_AXIS]))
+    stride_cols = int(round(input_shape[COL_AXIS] / residual_shape[COL_AXIS]))
+    equal_channels = input_shape[CHANNEL_AXIS] == residual_shape[CHANNEL_AXIS]
+
+    shortcut = input_feature
+    # 1 X 1 conv if shape is different. Else identity.
+    if stride_rows > 1 or stride_cols > 1 or not equal_channels:
+        # Project the input with a strided 1x1 convolution followed by batch normalization.
+        if conv_name_base is not None:
+            conv_name_base = conv_name_base + '1'
+        shortcut = Conv2D(filters=residual_shape[CHANNEL_AXIS],
+                          kernel_size=(1, 1),
+                          strides=(stride_rows, stride_cols),
+                          padding="valid",
+                          kernel_initializer="he_normal",
+                          kernel_regularizer=l2(0.0001),
+                          name=conv_name_base)(input_feature)
+        if bn_name_base is not None:
+            bn_name_base = bn_name_base + '1'
+        shortcut = BatchNormalization(axis=CHANNEL_AXIS, name=bn_name_base)(shortcut)
+
+    return add([shortcut, residual])
+
+
+def _residual_block(block_function, filters, blocks, stage,
+                    transition_strides=None, transition_dilation_rates=None,
+                    dilation_rates=(1, 1), is_first_layer=False, dropout=None,
+                    residual_unit=_bn_relu_conv):
+    """Builds a residual block with repeating bottleneck blocks.
+
+       stage: integer, current stage label, used for generating layer names
+       blocks: number of blocks 'a','b'..., current block label, used for generating layer names
+       transition_strides, transition_dilation_rates: per-block lists of stride / dilation tuples
+       dilation_rates: unused (indexing it per block overflowed its 2-tuple default); kept for compatibility
+    """
+    if transition_dilation_rates is None:
+        transition_dilation_rates = [(1, 1)] * blocks
+    if transition_strides is None:
+        transition_strides = [(1, 1)] * blocks
+
+    def f(x):
+        for i in range(blocks):
+            x = block_function(filters=filters, stage=stage, block=i,
+                               transition_strides=transition_strides[i],
+                               dilation_rate=transition_dilation_rates[i],
+                               is_first_block_of_first_layer=(is_first_layer and i == 0),
+                               dropout=dropout,
+                               residual_unit=residual_unit)(x)
+        return x
+
+    return f
+
+
+def _block_name_base(stage, block):
+    """Get the convolution name base and batch normalization name base defined by stage and block.
+
+    Block indices 0..25 are labeled 'a'..'z' to match the paper and keras;
+    indices of 26 and above are labeled with their decimal number.
+    """
+    if block < 26:
+        block = '%c' % (block + 97)  # 97 is the ascii number for lowercase 'a'
+    conv_name_base = 'res' + str(stage) + str(block) + '_branch'
+    bn_name_base = 'bn' + str(stage) + str(block) + '_branch'
+    return conv_name_base, bn_name_base
+
+
+def basic_block(filters, stage, block, transition_strides=(1, 1),
+                dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None,
+                residual_unit=_bn_relu_conv):
+    """Basic 3 X 3 convolution blocks for use on resnets with layers <= 34.
+    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
+    """
+    def f(input_features):
+        conv_name_base, bn_name_base = _block_name_base(stage, block)
+        if is_first_block_of_first_layer:
+            # don't repeat bn->relu since we just did bn->relu->maxpool
+            x = Conv2D(filters=filters, kernel_size=(3, 3),
+                       strides=transition_strides,
+                       dilation_rate=dilation_rate,
+                       padding="same",
+                       kernel_initializer="he_normal",
+                       kernel_regularizer=l2(1e-4),
+                       name=conv_name_base + '2a')(input_features)
+        else:
+            # The residual units read `conv_name` / `bn_name`; other keys are ignored.
+            x = residual_unit(filters=filters, kernel_size=(3, 3),
+                              strides=transition_strides, dilation_rate=dilation_rate,
+                              conv_name=conv_name_base + '2a',
+                              bn_name=bn_name_base + '2a')(input_features)
+
+        if dropout is not None:
+            x = Dropout(dropout)(x)
+
+        x = residual_unit(filters=filters, kernel_size=(3, 3),
+                          conv_name=conv_name_base + '2b',
+                          bn_name=bn_name_base + '2b')(x)
+
+        return _shortcut(input_features, x)
+
+    return f
+
+
+def bottleneck(filters, stage, block, transition_strides=(1, 1),
+               dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None,
+               residual_unit=_bn_relu_conv):
+    """Bottleneck architecture for > 34 layer resnet.
+    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
+
+    Returns:
+        A final conv layer of filters * 4
+    """
+    def f(input_feature):
+        conv_name_base, bn_name_base = _block_name_base(stage, block)
+        if is_first_block_of_first_layer:
+            # don't repeat bn->relu since we just did bn->relu->maxpool
+            x = Conv2D(filters=filters, kernel_size=(1, 1),
+                       strides=transition_strides,
+                       dilation_rate=dilation_rate,
+                       padding="same",
+                       kernel_initializer="he_normal",
+                       kernel_regularizer=l2(1e-4),
+                       name=conv_name_base + '2a')(input_feature)
+        else:
+            # The residual units read `conv_name` / `bn_name`; other keys are ignored.
+            x = residual_unit(filters=filters, kernel_size=(1, 1),
+                              strides=transition_strides, dilation_rate=dilation_rate,
+                              conv_name=conv_name_base + '2a',
+                              bn_name=bn_name_base + '2a')(input_feature)
+
+        if dropout is not None:
+            x = Dropout(dropout)(x)
+
+        x = residual_unit(filters=filters, kernel_size=(3, 3),
+                          conv_name=conv_name_base + '2b',
+                          bn_name=bn_name_base + '2b')(x)
+
+        if dropout is not None:
+            x = Dropout(dropout)(x)
+
+        x = residual_unit(filters=filters * 4, kernel_size=(1, 1),
+                          conv_name=conv_name_base + '2c',
+                          bn_name=bn_name_base + '2c')(x)
+
+        return _shortcut(input_feature, x)
+
+    return f
+
+
+def _handle_dim_ordering():
+    """Set the module-level ROW_AXIS, COL_AXIS and CHANNEL_AXIS indices.
+
+    The values follow the backend's current `K.image_data_format()`.
+    """
+    global ROW_AXIS, COL_AXIS, CHANNEL_AXIS
+    if K.image_data_format() == 'channels_last':
+        # channels_last: rows, cols and channels sit at axes 1, 2 and 3.
+        ROW_AXIS, COL_AXIS, CHANNEL_AXIS = 1, 2, 3
+    else:
+        # otherwise channels come first: channels, rows and cols at axes 1, 2 and 3.
+        CHANNEL_AXIS, ROW_AXIS, COL_AXIS = 1, 2, 3
+
+
+def _string_to_function(identifier):
+    """Resolve a string to the module-level object of that name; return other values unchanged."""
+    if not isinstance(identifier, six.string_types):
+        return identifier
+    if not globals().get(identifier):
+        raise ValueError('Invalid {}'.format(identifier))
+    return globals()[identifier]
+
+
+def ResNet(input_shape=None, classes=10, block='bottleneck', residual_unit='v2', repetitions=None,
+           initial_filters=64, activation='softmax', include_top=True, input_tensor=None, dropout=None,
+           transition_dilation_rate=(1, 1), initial_strides=(2, 2), initial_kernel_size=(7, 7),
+           initial_pooling='max', final_pooling=None, top='classification'):
+    """Builds a custom ResNet like architecture. Defaults to ResNet50 v2.
+
+    Args:
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` (with `channels_last` dim ordering)
+            or `(3, 224, 224)` (with `channels_first` dim ordering).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 8.
+            E.g. `(224, 224, 3)` would be one valid value.
+        classes: The number of outputs at final softmax layer
+        block: The block function to use. This is either `'basic'` or `'bottleneck'`.
+            The original paper used `basic` for layers < 50.
+        repetitions: Number of repetitions of various block units.
+            At each block unit, the number of filters are doubled and the input size is halved.
+            Default of None implies the ResNet50v2 values of [3, 4, 6, 3].
+        residual_unit: the basic residual unit, 'v1' for conv bn relu, 'v2' for bn relu conv.
+            See [Identity Mappings in Deep Residual Networks](https://arxiv.org/abs/1603.05027)
+            for details.
+        initial_filters: Number of filters of the first convolution; doubled after each stage.
+        activation: Activation of the final layer, one of 'softmax', 'sigmoid' or None.
+            'sigmoid' is only accepted when `classes` is 1.
+        include_top: Whether to append the final layers selected by `top`.
+        input_tensor: Optional Keras tensor to use as the input of the model.
+        dropout: None for no dropout, otherwise rate of dropout from 0 to 1.
+            Based on [Wide Residual Networks](https://arxiv.org/pdf/1605.07146) paper.
+        transition_dilation_rate: Dilation rate for transition layers. For semantic
+            segmentation of images use a dilation rate of (2, 2).
+        initial_strides: Stride of the very first convolution and of the MaxPooling2D call,
+            with default (2, 2), set to (1, 1) for small images like cifar.
+        initial_kernel_size: kernel size of the very first convolution, (7, 7) for imagenet
+            and (3, 3) for small image datasets like tiny imagenet and cifar.
+            See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details.
+        initial_pooling: Determine if there will be an initial pooling layer,
+            'max' for imagenet and None for small image datasets.
+            See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details.
+        final_pooling: Optional pooling mode for feature extraction at the final model layer
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        top: Defines final layers to evaluate based on a specific problem type. Options are
+            'classification' for ImageNet style problems, 'segmentation' for problems like
+            the Pascal VOC dataset, and None to exclude these layers entirely.
+
+    Returns:
+        The keras `Model`.
+
+    Raises:
+        ValueError: if `activation` is not 'softmax', 'sigmoid' or None, or if
+            'sigmoid' is requested with `classes != 1`.
+        Exception: if the resolved input shape does not have exactly 3 dimensions.
+    """
+    if activation not in ['softmax', 'sigmoid', None]:
+        raise ValueError('activation must be one of "softmax", "sigmoid", or None')
+    if activation == 'sigmoid' and classes != 1:
+        raise ValueError('sigmoid activation can only be used when classes = 1')
+    if repetitions is None:
+        repetitions = [3, 4, 6, 3]
+    # Determine proper input shape
+    input_shape = _obtain_input_shape(input_shape,
+                                      default_size=32,
+                                      min_size=8,
+                                      data_format=K.image_data_format(),
+                                      require_flatten=include_top)
+    _handle_dim_ordering()
+    if len(input_shape) != 3:
+        raise Exception("Input shape should be a tuple (nb_channels, nb_rows, nb_cols)")
+
+    if block == 'basic':
+        block_fn = basic_block
+    elif block == 'bottleneck':
+        block_fn = bottleneck
+    elif isinstance(block, six.string_types):
+        block_fn = _string_to_function(block)
+    else:
+        block_fn = block
+
+    if residual_unit == 'v2':
+        residual_unit = _bn_relu_conv
+    elif residual_unit == 'v1':
+        residual_unit = _conv_bn_relu
+    elif isinstance(residual_unit, six.string_types):
+        residual_unit = _string_to_function(residual_unit)
+    else:
+        residual_unit = residual_unit
+
+    # `input_shape` is already validated in the backend's own data format, so it is used as is.
+    img_input = Input(shape=input_shape, tensor=input_tensor)
+    x = _conv_bn_relu(filters=initial_filters, kernel_size=initial_kernel_size, strides=initial_strides)(img_input)
+    if initial_pooling == 'max':
+        x = MaxPooling2D(pool_size=(3, 3), strides=initial_strides, padding="same")(x)
+
+    block = x
+    filters = initial_filters
+    for i, r in enumerate(repetitions):
+        transition_dilation_rates = [transition_dilation_rate] * r
+        transition_strides = [(1, 1)] * r
+        if transition_dilation_rate == (1, 1):
+            transition_strides[0] = (2, 2)
+        block = _residual_block(block_fn, filters=filters,
+                                stage=i, blocks=r,
+                                is_first_layer=(i == 0),
+                                dropout=dropout,
+                                transition_dilation_rates=transition_dilation_rates,
+                                transition_strides=transition_strides,
+                                residual_unit=residual_unit)(block)
+        filters *= 2
+
+    # Last activation
+    x = _bn_relu(block)
+
+    # Classifier block
+    if include_top and top == 'classification':
+        x = GlobalAveragePooling2D()(x)
+        x = Dense(units=classes, activation=activation, kernel_initializer="he_normal")(x)
+    elif include_top and top == 'segmentation':
+        x = Conv2D(classes, (1, 1), activation='linear', padding='same')(x)
+
+        if K.image_data_format() == 'channels_first':
+            channel, row, col = input_shape
+        else:
+            row, col, channel = input_shape
+
+        x = Reshape((row * col, classes))(x)
+        x = Activation(activation)(x)
+        x = Reshape((row, col, classes))(x)
+    elif final_pooling == 'avg':
+        x = GlobalAveragePooling2D()(x)
+    elif final_pooling == 'max':
+        x = GlobalMaxPooling2D()(x)
+
+    model = Model(inputs=img_input, outputs=x)
+    return model
+
+
+def ResNet18(input_shape, classes):
+    """Build an 18-layer ResNet from basic blocks with the default v2 residual units."""
+    stage_repetitions = [2, 2, 2, 2]
+    return ResNet(input_shape, classes, basic_block, repetitions=stage_repetitions)
+
+
+def ResNet34(input_shape, classes):
+    """Build a 34-layer ResNet from basic blocks with the default v2 residual units."""
+    stage_repetitions = [3, 4, 6, 3]
+    return ResNet(input_shape, classes, basic_block, repetitions=stage_repetitions)
+
+
+def ResNet50(input_shape, classes):
+    """Build a 50-layer ResNet from bottleneck blocks with the default v2 residual units."""
+    stage_repetitions = [3, 4, 6, 3]
+    return ResNet(input_shape, classes, bottleneck, repetitions=stage_repetitions)
+
+
+def ResNet101(input_shape, classes):
+    """Build a 101-layer ResNet from bottleneck blocks with the default v2 residual units."""
+    stage_repetitions = [3, 4, 23, 3]
+    return ResNet(input_shape, classes, bottleneck, repetitions=stage_repetitions)
+
+
+def ResNet152(input_shape, classes):
+    """Build a 152-layer ResNet from bottleneck blocks with the default v2 residual units."""
+    stage_repetitions = [3, 8, 36, 3]
+    return ResNet(input_shape, classes, bottleneck, repetitions=stage_repetitions)
diff --git a/keras_contrib/applications/wide_resnet.py b/keras_contrib/applications/wide_resnet.py
index 647d63a..5df99f8 100644
--- a/keras_contrib/applications/wide_resnet.py
+++ b/keras_contrib/applications/wide_resnet.py
@@ -89,7 +89,7 @@ def WideResidualNetwork(depth=28, width=8, dropout_rate=0.0,
                                       default_size=32,
                                       min_size=8,
                                       data_format=K.image_dim_ordering(),
-                                      include_top=include_top)
+                                      require_flatten=include_top)
 
     if input_tensor is None:
         img_input = Input(shape=input_shape)
diff --git a/keras_contrib/backend/cntk_backend.py b/keras_contrib/backend/cntk_backend.py
index 363ad08..624aeee 100644
--- a/keras_contrib/backend/cntk_backend.py
+++ b/keras_contrib/backend/cntk_backend.py
@@ -1,2 +1,26 @@
 from keras.backend import cntk_backend as KCN
 import cntk as C
+import numpy as np
+
+
+def clip(x, min_value, max_value):
+    """Clamp every element of `x` to the range `[min_value, max_value]`.
+
+    When `min_value` exceeds `max_value`, the upper bound is raised to
+    `min_value`, so the effective range collapses to `[min_value, min_value]`.
+
+    # Arguments
+        x: Tensor or variable.
+        min_value: Tensor, float, int, or None.
+            None means no lower bound (-infinity).
+        max_value: Tensor, float, int, or None.
+            None means no upper bound (infinity).
+
+    # Returns
+        A tensor.
+    """
+    lower = -np.inf if min_value is None else min_value
+    upper = np.inf if max_value is None else max_value
+    # Never let the upper bound fall below the lower one.
+    upper = C.maximum(lower, upper)
+    return C.clip(x, lower, upper)
diff --git a/keras_contrib/backend/tensorflow_backend.py b/keras_contrib/backend/tensorflow_backend.py
index 7b69687..284cbe4 100644
--- a/keras_contrib/backend/tensorflow_backend.py
+++ b/keras_contrib/backend/tensorflow_backend.py
@@ -1,28 +1,71 @@
 import tensorflow as tf
+import numpy as np
 
 try:
     from tensorflow.python.ops import ctc_ops as ctc
 except ImportError:
     import tensorflow.contrib.ctc as ctc
 from keras.backend import tensorflow_backend as KTF
-from keras.backend.common import floatx, image_data_format
-from keras.backend.tensorflow_backend import _preprocess_conv3d_input
-from keras.backend.tensorflow_backend import _postprocess_conv3d_output
-from keras.backend.tensorflow_backend import _preprocess_padding
-from keras.backend.tensorflow_backend import _preprocess_conv2d_input
-from keras.backend.tensorflow_backend import _postprocess_conv2d_output
+from keras.backend import dtype
+from keras.backend.common import floatx
+from keras.backend.common import image_data_format
+from keras.backend.tensorflow_backend import _to_tensor
 
 py_all = all
 
 
-def _preprocess_deconv_output_shape(x, shape, data_format):
+def _preprocess_conv2d_input(x, data_format):
+    """Cast float64 input to float32 and move channels last before the conv2d.
+    # Arguments
+        x: input tensor.
+        data_format: string, `"channels_last"` or `"channels_first"`.
+    # Returns
+        A tensor.
+    """
+    if dtype(x) == 'float64':
+        x = tf.cast(x, 'float32')
     if data_format == 'channels_first':
-        shape = (shape[0],) + tuple(shape[2:]) + (shape[1],)
+        # TF uses the last dimension as the channel dimension
+        # instead of the 2nd one, so permute the axes:
+        # TH input shape: (samples, input_depth, rows, cols)
+        # TF input shape: (samples, rows, cols, input_depth)
+        x = tf.transpose(x, (0, 2, 3, 1))
+    return x
 
-    if shape[0] is None:
-        shape = (tf.shape(x)[0],) + tuple(shape[1:])
-        shape = tf.stack(list(shape))
-    return shape
+
+def _postprocess_conv2d_output(x, data_format):
+    """Transpose a conv2d result back to channels_first and cast to float64 if needed.
+    # Arguments
+        x: A tensor.
+        data_format: string, `"channels_last"` or `"channels_first"`.
+    # Returns
+        A tensor.
+    """
+    if data_format == 'channels_first':
+        # (samples, rows, cols, depth) -> (samples, depth, rows, cols)
+        x = tf.transpose(x, (0, 3, 1, 2))
+    # Cast up only when the configured floatx is float64.
+    if floatx() != 'float64':
+        return x
+    return tf.cast(x, 'float64')
+
+
+def _preprocess_padding(padding):
+    """Convert keras' padding to tensorflow's padding.
+    # Arguments
+        padding: string, `"same"` or `"valid"`.
+    # Returns
+        a string, `"SAME"` or `"VALID"`.
+    # Raises
+        ValueError: if `padding` is invalid.
+    """
+    if padding == 'same':
+        return 'SAME'
+    if padding == 'valid':
+        return 'VALID'
+    # Format one message string; passing `padding` as a second exception
+    # argument would make the error text render as a tuple.
+    raise ValueError('Invalid padding: ' + str(padding))
 
 
 def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_first',
@@ -70,45 +113,6 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_fir
     return x
 
 
-def deconv3d(x, kernel, output_shape, strides=(1, 1, 1),
-             padding='valid',
-             data_format='default',
-             image_shape=None, filter_shape=None):
-    '''3D deconvolution (i.e. transposed convolution).
-
-    # Arguments
-        x: input tensor.
-        kernel: kernel tensor.
-        output_shape: 1D int tensor for the output shape.
-        strides: strides tuple.
-        padding: string, "same" or "valid".
-        data_format: "tf" or "th".
-            Whether to use Theano or TensorFlow dimension ordering
-            for inputs/kernels/ouputs.
-
-    # Returns
-        A tensor, result of transposed 3D convolution.
-
-    # Raises
-        ValueError: if `data_format` is neither `tf` or `th`.
-    '''
-    if data_format == 'default':
-        data_format = image_data_format()
-    if data_format not in {'channels_first', 'channels_last'}:
-        raise ValueError('Unknown data_format ' + str(data_format))
-
-    x = _preprocess_conv3d_input(x, data_format)
-    output_shape = _preprocess_deconv_output_shape(x, output_shape,
-                                                   data_format)
-    kernel = tf.transpose(kernel, (0, 1, 2, 4, 3))
-    padding = _preprocess_padding(padding)
-    strides = (1,) + strides + (1,)
-
-    x = tf.nn.conv3d_transpose(x, kernel, output_shape, strides,
-                               padding=padding)
-    return _postprocess_conv3d_output(x, data_format)
-
-
 def extract_image_patches(x, ksizes, ssizes, padding='same',
                           data_format='channels_last'):
     '''
@@ -158,3 +162,28 @@ def moments(x, axes, shift=None, keep_dims=False):
     ''' Wrapper over tensorflow backend call '''
 
     return tf.nn.moments(x, axes, shift=shift, keep_dims=keep_dims)
+
+
+def clip(x, min_value, max_value):
+    """Clamp every element of `x` to the range `[min_value, max_value]`.
+
+    When `min_value` exceeds `max_value`, the upper bound is raised to
+    `min_value`, so the effective range collapses to `[min_value, min_value]`.
+
+    # Arguments
+        x: Tensor or variable.
+        min_value: Tensor, float, int, or None.
+            None means no lower bound (-infinity).
+        max_value: Tensor, float, int, or None.
+            None means no upper bound (infinity).
+
+    # Returns
+        A tensor.
+    """
+    target_dtype = x.dtype.base_dtype
+    # Missing bounds become infinities; both bounds are then passed through
+    # `_to_tensor` with x's base dtype.
+    lower = _to_tensor(-np.inf if min_value is None else min_value, target_dtype)
+    upper = _to_tensor(np.inf if max_value is None else max_value, target_dtype)
+    # Never let the upper bound fall below the lower one.
+    upper = tf.maximum(lower, upper)
+    return tf.clip_by_value(x, lower, upper)
diff --git a/keras_contrib/backend/theano_backend.py b/keras_contrib/backend/theano_backend.py
index 2b5adaf..78af0ef 100644
--- a/keras_contrib/backend/theano_backend.py
+++ b/keras_contrib/backend/theano_backend.py
@@ -1,5 +1,6 @@
 from theano import tensor as T
 from theano.sandbox.neighbours import images2neibs
+import numpy as np
 
 try:
     import theano.sparse as th_sparse_module
@@ -85,56 +86,6 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_fir
     return conv_out
 
 
-def deconv3d(x, kernel, output_shape, strides=(1, 1, 1),
-             padding='valid',
-             data_format=None, filter_shape=None):
-    '''3D deconvolution (transposed convolution).
-
-    # Arguments
-        kernel: kernel tensor.
-        output_shape: desired dimensions of output.
-        strides: strides tuple.
-        padding: string, "same" or "valid".
-        data_format: "channels_last" or "channels_first".
-            Whether to use Theano or TensorFlow dimension ordering
-        in inputs/kernels/ouputs.
-    '''
-    flip_filters = False
-    if data_format is None:
-        data_format = image_data_format()
-    if data_format not in {'channels_first', 'channels_last'}:
-        raise ValueError('Unknown data_format: ' + str(data_format))
-
-    if data_format == 'channels_last':
-        output_shape = (output_shape[0], output_shape[4], output_shape[1],
-                        output_shape[2], output_shape[3])
-
-    x = _preprocess_conv3d_input(x, data_format)
-    kernel = _preprocess_conv3d_kernel(kernel, data_format)
-    kernel = kernel.dimshuffle((1, 0, 2, 3, 4))
-    th_padding = _preprocess_padding(padding)
-
-    if hasattr(kernel, '_keras_shape'):
-        kernel_shape = kernel._keras_shape
-    else:
-        # Will only work if `kernel` is a shared variable.
-        kernel_shape = kernel.eval().shape
-
-    filter_shape = _preprocess_conv3d_filter_shape(filter_shape, data_format)
-    filter_shape = tuple(filter_shape[i] for i in (1, 0, 2, 3, 4))
-
-    conv_out = T.nnet.abstract_conv.conv3d_grad_wrt_inputs(
-        x, kernel, output_shape,
-        filter_shape=filter_shape,
-        border_mode=th_padding,
-        subsample=strides,
-        filter_flip=not flip_filters)
-
-    conv_out = _postprocess_conv3d_output(conv_out, x, padding,
-                                          kernel_shape, strides, data_format)
-    return conv_out
-
-
 def extract_image_patches(X, ksizes, strides, padding='valid', data_format='channels_first'):
     '''
     Extract the patches from an image
@@ -197,3 +148,26 @@ def moments(x, axes, shift=None, keep_dims=False):
     var_batch = KTH.var(x, axis=axes, keepdims=keep_dims)
 
     return mean_batch, var_batch
+
+
+def clip(x, min_value, max_value):
+    """Clamp every element of `x` to the range `[min_value, max_value]`.
+
+    When `min_value` exceeds `max_value`, the upper bound is raised to
+    `min_value`, so the effective range collapses to `[min_value, min_value]`.
+
+    # Arguments
+        x: Tensor or variable.
+        min_value: Tensor, float, int, or None.
+            None means no lower bound (-infinity).
+        max_value: Tensor, float, int, or None.
+            None means no upper bound (infinity).
+
+    # Returns
+        A tensor.
+    """
+    lower = -np.inf if min_value is None else min_value
+    upper = np.inf if max_value is None else max_value
+    # Never let the upper bound fall below the lower one.
+    upper = T.maximum(lower, upper)
+    return T.clip(x, lower, upper)
diff --git a/keras_contrib/callbacks/dead_relu_detector.py b/keras_contrib/callbacks/dead_relu_detector.py
index 2019f56..2cfe37b 100644
--- a/keras_contrib/callbacks/dead_relu_detector.py
+++ b/keras_contrib/callbacks/dead_relu_detector.py
@@ -1,8 +1,6 @@
 import numpy as np
-import warnings
 
 from keras.callbacks import Callback
-from keras.layers import Dense
 from keras import backend as K
 
 
@@ -13,10 +11,11 @@ class DeadReluDetector(Callback):
     # Arguments
         x_train: Training dataset to check whether or not neurons fire
         verbose: verbosity mode
-            True means that even a single dead neuron triggers warning
+            True means that even a single dead neuron triggers a warning message
             False means that only significant number of dead neurons (10% or more)
-            triggers warning
+            triggers a warning message
     """
+
     def __init__(self, x_train, verbose=False):
         super(DeadReluDetector, self).__init__()
         self.x_train = x_train
@@ -25,7 +24,8 @@ class DeadReluDetector(Callback):
 
     @staticmethod
     def is_relu_layer(layer):
-        return isinstance(layer, Dense) and layer.get_config()['activation'] == 'relu'
+        # Any layer whose config declares a relu activation qualifies. Tested for Dense and Conv2D
+        return layer.get_config().get('activation') == 'relu'
 
     def get_relu_activations(self):
         model_input = self.model.input
@@ -44,17 +44,43 @@ class DeadReluDetector(Callback):
         layer_outputs = [func(list_inputs)[0] for func in funcs]
         for layer_index, layer_activations in enumerate(layer_outputs):
             if self.is_relu_layer(self.model.layers[layer_index]):
-                yield [layer_index, layer_activations]
+                layer_name = self.model.layers[layer_index].name
+                # layer_weight is a list [W] (+ [b])
+                layer_weight = self.model.layers[layer_index].get_weights()
+                # with kernel and bias, the weights are saved as a list [W, b]. If only weights, it is [W]
+                if type(layer_weight) is not list:
+                    raise ValueError("'Layer_weight' should be a list, but was {}".format(type(layer_weight)))
+
+                layer_weight_shape = np.shape(layer_weight[0])
+                yield [layer_index, layer_activations, layer_name, layer_weight_shape]
 
     def on_epoch_end(self, epoch, logs={}):
         for relu_activation in self.get_relu_activations():
-            layer_index, activation_values = relu_activation
-            total_neurons = activation_values.shape[-1]
-            dead_neurons = np.sum(activation_values == 0)
-            dead_neurons_share = dead_neurons / total_neurons
-            if (self.verbose and dead_neurons > 0) or dead_neurons_share > self.dead_neurons_share_threshold:
-                warnings.warn(
-                    'Layer #{} has {} dead neurons ({:.2%})!'
-                        .format(layer_index, dead_neurons, dead_neurons_share),
-                    RuntimeWarning
-                )
+            layer_index, activation_values, layer_name, layer_weight_shape = relu_activation
+
+            act_shape = activation_values.shape
+            n_act_axes = len(act_shape)
+
+            # Locate the feature (filter/unit) axis; should work for both Conv and Flat.
+            if K.image_data_format() == 'channels_last':
+                # features in last axis
+                feature_axis = -1
+            else:
+                # features sit before the convolution axes: the kernel rank minus its
+                # input and output dimensions is the number of axes to step over
+                feature_axis = -1 - (len(layer_weight_shape) - 2)
+
+            total_featuremaps = act_shape[feature_axis]
+
+            # Reduce over every axis except the feature axis (expressed as a positive index).
+            feature_axis_pos = n_act_axes + feature_axis
+            reduce_axes = tuple(i for i in range(n_act_axes) if i != feature_axis_pos)
+
+            # A feature map counts as dead when its activations sum to exactly zero.
+            dead_neurons = np.sum(np.sum(activation_values, axis=reduce_axes) == 0)
+            dead_neurons_share = float(dead_neurons) / float(total_featuremaps)
+
+            if (self.verbose and dead_neurons > 0) or dead_neurons_share >= self.dead_neurons_share_threshold:
+                message = 'Layer {} (#{}) has {} dead neurons ({:.2%})!'.format(
+                    layer_name, layer_index, dead_neurons, dead_neurons_share)
+                print(message)
diff --git a/keras_contrib/datasets/conll2000.py b/keras_contrib/datasets/conll2000.py
old mode 100644
new mode 100755
index 22a97e1..5561f17
--- a/keras_contrib/datasets/conll2000.py
+++ b/keras_contrib/datasets/conll2000.py
@@ -16,7 +16,7 @@ def load_data(path='conll2000.zip', min_freq=2):
     archive.close()
 
     word_counts = Counter(row[0].lower() for sample in train for row in sample)
-    vocab = ['<pad>', '<unk>'] + [w for w, f in word_counts.iteritems() if f >= min_freq]
+    vocab = ['<pad>', '<unk>'] + [w for w, f in iter(word_counts.items()) if f >= min_freq]
     pos_tags = sorted(list(set(row[1] for sample in train + test for row in sample)))  # in alphabetic order
     chunk_tags = sorted(list(set(row[2] for sample in train + test for row in sample)))  # in alphabetic order
 
@@ -27,7 +27,7 @@ def load_data(path='conll2000.zip', min_freq=2):
 
 def _parse_data(fh):
     string = fh.read()
-    data = [[row.split() for row in sample.split('\n')] for sample in string.strip().split('\n\n')]
+    data = [[row.split() for row in sample.split('\n')] for sample in string.decode('utf-8').strip().split('\n\n')]  # explicit codec: bare .decode() is ASCII on py2
     fh.close()
     return data
 
diff --git a/keras_contrib/layers/advanced_activations.py b/keras_contrib/layers/advanced_activations.py
index 179856c..7bf349e 100644
--- a/keras_contrib/layers/advanced_activations.py
+++ b/keras_contrib/layers/advanced_activations.py
@@ -236,3 +236,50 @@ class SReLU(Layer):
         return dict(list(base_config.items()) + list(config.items()))
 
 get_custom_objects().update({'SReLU': SReLU})
+
+
+class Swish(Layer):
+    """ Swish (Ramachandran et al., 2017)
+
+    # Input shape
+        Arbitrary. Use the keyword argument `input_shape`
+        (tuple of integers, does not include the samples axis)
+        when using this layer as the first layer in a model.
+
+    # Output shape
+        Same shape as the input.
+
+    # Arguments
+        beta: float >= 0. Scaling factor
+            if set to 1 and trainable set to False (default), Swish equals the SiLU activation (Elfwing et al., 2017)
+        trainable: whether to learn the scaling factor during training or not
+
+    # References
+        - [Searching for Activation Functions](https://arxiv.org/abs/1710.05941)
+        - [Sigmoid-weighted linear units for neural network function approximation in reinforcement learning](https://arxiv.org/abs/1702.03118)
+    """
+
+    def __init__(self, beta=1.0, trainable=False, **kwargs):
+        super(Swish, self).__init__(**kwargs)
+        self.supports_masking = True
+        self.beta = beta
+        self.trainable = trainable
+
+    def build(self, input_shape):
+        self.scaling_factor = K.variable(self.beta, dtype=K.floatx(), name='scaling_factor')
+        # Register the factor as a weight only when it is meant to be learned.
+        if self.trainable:
+            self._trainable_weights.append(self.scaling_factor)
+        super(Swish, self).build(input_shape)
+
+    def call(self, inputs, mask=None):
+        # swish(x) = x * sigmoid(beta * x)
+        return inputs * K.sigmoid(self.scaling_factor * inputs)
+
+    def get_config(self):
+        # Report the learned value once built, else the initial beta; always as a plain float.
+        beta = K.get_value(self.scaling_factor) if self.trainable and self.built else self.beta
+        config = {'beta': float(beta), 'trainable': self.trainable}
+        return dict(list(super(Swish, self).get_config().items()) + list(config.items()))
+
+get_custom_objects().update({'Swish': Swish})
diff --git a/keras_contrib/layers/convolutional.py b/keras_contrib/layers/convolutional.py
index c60df62..0899309 100644
--- a/keras_contrib/layers/convolutional.py
+++ b/keras_contrib/layers/convolutional.py
@@ -16,220 +16,6 @@ from keras.utils.conv_utils import normalize_data_format
 import numpy as np
 
 
-class Deconvolution3D(Convolution3D):
-    """Transposed convolution operator for filtering windows of 3-D inputs.
-
-    The need for transposed convolutions generally arises from the desire to
-    use a transformation going in the opposite direction
-    of a normal convolution, i.e., from something that has the shape
-    of the output of some convolution to something that has the shape
-    of its input while maintaining a connectivity pattern
-    that is compatible with said convolution.
-
-    When using this layer as the first layer in a model,
-    provide the keyword argument `input_shape`
-    (tuple of integers, does not include the sample axis),
-    e.g. `input_shape=(3, 128, 128, 128)` for a 128x128x128 volume with
-    three channels.
-
-    To pass the correct `output_shape` to this layer,
-    one could use a test model to predict and observe the actual output shape.
-
-    # Examples
-
-    ```python
-        # TH dim ordering.
-        # apply a 3x3x3 transposed convolution
-        # with stride 1x1x1 and 3 output filters on a 12x12x12 image:
-        model = Sequential()
-        model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 3, 14, 14, 14),
-                                  padding='valid',
-                                  input_shape=(3, 12, 12, 12)))
-
-        # we can predict with the model and print the shape of the array.
-        dummy_input = np.ones((32, 3, 12, 12, 12))
-        preds = model.predict(dummy_input)
-        print(preds.shape)  # (None, 3, 14, 14, 14)
-
-        # apply a 3x3x3 transposed convolution
-        # with stride 2x2x2 and 3 output filters on a 12x12x12 image:
-        model = Sequential()
-        model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 3, 25, 25, 25),
-                                  strides=(2, 2, 2),
-                                  padding='valid',
-                                  input_shape=(3, 12, 12, 12)))
-        model.summary()
-
-        # we can predict with the model and print the shape of the array.
-        dummy_input = np.ones((32, 3, 12, 12, 12))
-        preds = model.predict(dummy_input)
-        print(preds.shape)  # (None, 3, 25, 25, 25)
-    ```
-
-    ```python
-        # TF dim ordering.
-        # apply a 3x3x3 transposed convolution
-        # with stride 1x1x1 and 3 output filters on a 12x12x12 image:
-        model = Sequential()
-        model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 14, 14, 14, 3),
-                                  padding='valid',
-                                  input_shape=(12, 12, 12, 3)))
-
-        # we can predict with the model and print the shape of the array.
-        dummy_input = np.ones((32, 12, 12, 12, 3))
-        preds = model.predict(dummy_input)
-        print(preds.shape)  # (None, 14, 14, 14, 3)
-
-        # apply a 3x3x3 transposed convolution
-        # with stride 2x2x2 and 3 output filters on a 12x12x12 image:
-        model = Sequential()
-        model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 25, 25, 25, 3),
-                                  strides=(2, 2, 2),
-                                  padding='valid',
-                                  input_shape=(12, 12, 12, 3)))
-        model.summary()
-
-        # we can predict with the model and print the shape of the array.
-        dummy_input = np.ones((32, 12, 12, 12, 3))
-        preds = model.predict(dummy_input)
-        print(preds.shape)  # (None, 25, 25, 25, 3)
-    ```
-
-    # Arguments
-        filters: Number of transposed convolution filters to use.
-        kernel_size: kernel_size: An integer or tuple/list of 3 integers, specifying the
-            dimensions of the convolution window.
-        output_shape: Output shape of the transposed convolution operation.
-            tuple of integers
-            `(nb_samples, filters, conv_dim1, conv_dim2, conv_dim3)`.
-             It is better to use
-             a dummy input and observe the actual output shape of
-             a layer, as specified in the examples.
-        init: name of initialization function for the weights of the layer
-            (see [initializers](../initializers.md)), or alternatively,
-            Theano function to use for weights initialization.
-            This parameter is only relevant if you don't pass
-            a `weights` argument.
-        activation: name of activation function to use
-            (see [activations](../activations.md)),
-            or alternatively, elementwise Theano/TensorFlow function.
-            If you don't specify anything, no activation is applied
-            (ie. "linear" activation: a(x) = x).
-        weights: list of numpy arrays to set as initial weights.
-        padding: 'valid', 'same' or 'full'
-            ('full' requires the Theano backend).
-        strides: tuple of length 3. Factor by which to oversample output.
-            Also called strides elsewhere.
-        kernel_regularizer: instance of [WeightRegularizer](../regularizers.md)
-            (eg. L1 or L2 regularization), applied to the main weights matrix.
-        bias_regularizer: instance of [WeightRegularizer](../regularizers.md),
-            applied to the use_bias.
-        activity_regularizer: instance of [ActivityRegularizer](../regularizers.md),
-            applied to the network output.
-        kernel_constraint: instance of the [constraints](../constraints.md) module
-            (eg. maxnorm, nonneg), applied to the main weights matrix.
-        bias_constraint: instance of the [constraints](../constraints.md) module,
-            applied to the use_bias.
-        data_format: 'channels_first' or 'channels_last'. In 'channels_first' mode, the channels dimension
-            (the depth) is at index 1, in 'channels_last' mode is it at index 4.
-            It defaults to the `image_data_format` value found in your
-            Keras config file at `~/.keras/keras.json`.
-            If you never set it, then it will be "tf".
-        use_bias: whether to include a use_bias
-            (i.e. make the layer affine rather than linear).
-
-    # Input shape
-        5D tensor with shape:
-        `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if data_format='channels_first'
-        or 5D tensor with shape:
-        `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if data_format='channels_last'.
-
-    # Output shape
-        5D tensor with shape:
-        `(samples, filters, nekernel_conv_dim1, nekernel_conv_dim2, nekernel_conv_dim3)` if data_format='channels_first'
-        or 5D tensor with shape:
-        `(samples, nekernel_conv_dim1, nekernel_conv_dim2, nekernel_conv_dim3, filters)` if data_format='channels_last'.
-        `nekernel_conv_dim1`, `nekernel_conv_dim2` and `nekernel_conv_dim3` values might have changed due to padding.
-
-    # References
-        - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1)
-        - [Transposed convolution arithmetic](http://deeplearning.net/software/theano_versions/dev/tutorial/conv_arithmetic.html#transposed-convolution-arithmetic)
-        - [Deconvolutional Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
-    """
-
-    def __init__(self, filters, kernel_size,
-                 output_shape, activation=None, weights=None,
-                 padding='valid', strides=(1, 1, 1), data_format=None,
-                 kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None,
-                 kernel_constraint=None, bias_constraint=None,
-                 use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', **kwargs):
-        if padding not in {'valid', 'same', 'full'}:
-            raise ValueError('Invalid border mode for Deconvolution3D:', padding)
-        if len(output_shape) == 4:
-            # missing the batch size
-            output_shape = (None,) + tuple(output_shape)
-
-        self.output_shape_ = output_shape
-
-        super(Deconvolution3D, self).__init__(kernel_size=kernel_size,
-                                              filters=filters,
-                                              activation=activation,
-                                              weights=weights,
-                                              padding=padding,
-                                              strides=strides,
-                                              data_format=data_format,
-                                              kernel_regularizer=kernel_regularizer,
-                                              bias_regularizer=bias_regularizer,
-                                              activity_regularizer=activity_regularizer,
-                                              kernel_constraint=kernel_constraint,
-                                              bias_constraint=bias_constraint,
-                                              use_bias=use_bias,
-                                              kernel_initializer=kernel_initializer,
-                                              bias_initializer=bias_initializer,
-                                              **kwargs)
-
-    def compute_output_shape(self, input_shape):
-        if self.data_format == 'channels_first':
-            conv_dim1 = self.output_shape_[2]
-            conv_dim2 = self.output_shape_[3]
-            conv_dim3 = self.output_shape_[4]
-            return (input_shape[0], self.filters, conv_dim1, conv_dim2, conv_dim3)
-        elif self.data_format == 'channels_last':
-            conv_dim1 = self.output_shape_[1]
-            conv_dim2 = self.output_shape_[2]
-            conv_dim3 = self.output_shape_[3]
-            return (input_shape[0], conv_dim1, conv_dim2, conv_dim3, self.filters)
-        else:
-            raise ValueError('Invalid data format: ', self.data_format)
-
-    def call(self, x, mask=None):
-        kernel_shape = K.get_value(self.kernel).shape
-        output = K.deconv3d(x, self.kernel, self.output_shape_,
-                            strides=self.strides,
-                            padding=self.padding,
-                            data_format=self.data_format,
-                            filter_shape=kernel_shape)
-        if self.use_bias:
-            if self.data_format == 'channels_first':
-                output += K.reshape(self.bias, (1, self.filters, 1, 1, 1))
-            elif self.data_format == 'channels_last':
-                output += K.reshape(self.bias, (1, 1, 1, 1, self.filters))
-            else:
-                raise ValueError('Invalid data_format: ', self.data_format)
-        output = self.activation(output)
-        return output
-
-    def get_config(self):
-        config = {'output_shape': self.output_shape_}
-        base_config = super(Deconvolution3D, self).get_config()
-        return dict(list(base_config.items()) + list(config.items()))
-
-
-Deconv3D = Deconvolution3D
-get_custom_objects().update({'Deconvolution3D': Deconvolution3D})
-get_custom_objects().update({'Deconv3D': Deconv3D})
-
-
 class CosineConvolution2D(Layer):
     """Cosine Normalized Convolution operator for filtering windows of two-dimensional inputs.
     Cosine Normalization: Using Cosine Similarity Instead of Dot Product in Neural Networks
diff --git a/keras_contrib/layers/normalization.py b/keras_contrib/layers/normalization.py
index 53b356b..40254cd 100644
--- a/keras_contrib/layers/normalization.py
+++ b/keras_contrib/layers/normalization.py
@@ -219,7 +219,7 @@ class BatchRenormalization(Layer):
         self.initial_weights = weights
         self.r_max_value = r_max_value
         self.d_max_value = d_max_value
-        self.t_delta = K.variable(np.array(t_delta))
+        self.t_delta = t_delta
         self.beta_initializer = initializers.get(beta_initializer)
         self.gamma_initializer = initializers.get(gamma_initializer)
         self.moving_mean_initializer = initializers.get(moving_mean_initializer)
@@ -266,11 +266,13 @@ class BatchRenormalization(Layer):
                                                 name='{}_running_std'.format(self.name),
                                                 trainable=False)
 
-        self.r_max = K.variable(np.ones((1,)), name='{}_r_max'.format(self.name))
+        self.r_max = K.variable(1, name='{}_r_max'.format(self.name))
 
-        self.d_max = K.variable(np.zeros((1,)), name='{}_d_max'.format(self.name))
+        self.d_max = K.variable(0, name='{}_d_max'.format(self.name))
 
-        self.t = K.variable(np.zeros((1,)), name='{}_t'.format(self.name))
+        self.t = K.variable(0, name='{}_t'.format(self.name))
+
+        self.t_delta_tensor = K.constant(self.t_delta)
 
         if self.initial_weights is not None:
             self.set_weights(self.initial_weights)
@@ -290,13 +292,11 @@ class BatchRenormalization(Layer):
         mean_batch, var_batch = K.moments(inputs, reduction_axes, shift=None, keep_dims=False)
         std_batch = (K.sqrt(var_batch + self.epsilon))
 
-        r_max_value = K.get_value(self.r_max)
         r = std_batch / (K.sqrt(self.running_variance + self.epsilon))
-        r = K.stop_gradient(K.clip(r, 1 / r_max_value, r_max_value))
+        r = K.stop_gradient(K.clip(r, 1 / self.r_max, self.r_max))
 
-        d_max_value = K.get_value(self.d_max)
         d = (mean_batch - self.running_mean) / K.sqrt(self.running_variance + self.epsilon)
-        d = K.stop_gradient(K.clip(d, -d_max_value, d_max_value))
+        d = K.stop_gradient(K.clip(d, -self.d_max, self.d_max))
 
         if sorted(reduction_axes) == range(K.ndim(inputs))[:-1]:
             x_normed_batch = (inputs - mean_batch) / std_batch
@@ -323,7 +323,7 @@ class BatchRenormalization(Layer):
 
         self.add_update([K.update(self.r_max, r_val),
                          K.update(self.d_max, d_val),
-                         K.update_add(self.t, self.t_delta)], x)
+                         K.update_add(self.t, self.t_delta_tensor)], inputs)
 
         if training in {0, False}:
             return x_normed
@@ -358,13 +358,15 @@ class BatchRenormalization(Layer):
     def get_config(self):
         config = {'epsilon': self.epsilon,
                   'axis': self.axis,
+                  'center': self.center,
+                  'scale': self.scale,
+                  'momentum': self.momentum,
                   'gamma_regularizer': initializers.serialize(self.gamma_regularizer),
                   'beta_regularizer': initializers.serialize(self.beta_regularizer),
                   'moving_mean_initializer': initializers.serialize(self.moving_mean_initializer),
                   'moving_variance_initializer': initializers.serialize(self.moving_variance_initializer),
                   'beta_constraint': constraints.serialize(self.beta_constraint),
                   'gamma_constraint': constraints.serialize(self.gamma_constraint),
-                  'momentum': self.momentum,
                   'r_max_value': self.r_max_value,
                   'd_max_value': self.d_max_value,
                   't_delta': self.t_delta}
diff --git a/keras_contrib/layers/recurrent.py b/keras_contrib/layers/recurrent.py
index e85dc22..c85a6c6 100644
--- a/keras_contrib/layers/recurrent.py
+++ b/keras_contrib/layers/recurrent.py
@@ -8,5 +8,3 @@ from .. import initializers
 from .. import regularizers
 from keras.engine import Layer
 from keras.engine import InputSpec
-
-from keras.layers.recurrent import _time_distributed_dense
diff --git a/keras_contrib/optimizers/ftml.py b/keras_contrib/optimizers/ftml.py
index edfb9d7..7545934 100644
--- a/keras_contrib/optimizers/ftml.py
+++ b/keras_contrib/optimizers/ftml.py
@@ -2,7 +2,6 @@ from __future__ import absolute_import
 from keras.optimizers import Optimizer
 from .. import backend as K
 from keras.utils.generic_utils import get_custom_objects
-from keras.legacy import interfaces
 
 
 class FTML(Optimizer):
@@ -31,7 +30,6 @@ class FTML(Optimizer):
         self.epsilon = epsilon
         self.inital_decay = decay
 
-    @interfaces.legacy_get_updates_support
     def get_updates(self, loss, params):
         grads = self.get_gradients(loss, params)
         self.updates = [K.update_add(self.iterations, 1)]
diff --git a/setup.py b/setup.py
index aad0567..3c17537 100644
--- a/setup.py
+++ b/setup.py
@@ -3,11 +3,32 @@ from setuptools import find_packages
 
 
 setup(name='keras_contrib',
-      version='1.2.1',
-      description='Keras community contributions',
+      version='2.0.8',
+      description='Keras Deep Learning for Python, Community Contributions',
       author='Fariz Rahman',
       author_email='farizrahman4u@gmail.com',
       url='https://github.com/farizrahman4u/keras-contrib',
       license='MIT',
       install_requires=['keras'],
+      extras_require={
+          'h5py': ['h5py'],
+          'visualize': ['pydot>=1.2.0'],
+          'tests': ['pytest',
+                    'pytest-pep8',
+                    'pytest-xdist',
+                    'pytest-cov'],
+      },
+      classifiers=[
+          'Development Status :: 3 - Alpha',
+          'Intended Audience :: Developers',
+          'Intended Audience :: Education',
+          'Intended Audience :: Science/Research',
+          'License :: OSI Approved :: MIT License',
+          'Programming Language :: Python :: 2',
+          'Programming Language :: Python :: 2.7',
+          'Programming Language :: Python :: 3',
+          'Programming Language :: Python :: 3.6',
+          'Topic :: Software Development :: Libraries',
+          'Topic :: Software Development :: Libraries :: Python Modules'
+      ],
       packages=find_packages())
diff --git a/tests/keras_contrib/backend/backend_test.py b/tests/keras_contrib/backend/backend_test.py
index 64135b0..c3829e3 100644
--- a/tests/keras_contrib/backend/backend_test.py
+++ b/tests/keras_contrib/backend/backend_test.py
@@ -1,7 +1,6 @@
 import pytest
 from numpy.testing import assert_allclose
 import numpy as np
-import scipy.sparse as sparse
 
 from keras import backend as K
 from keras.backend import theano_backend as KTH, floatx, set_floatx, variable
@@ -157,8 +156,46 @@ class TestBackend(object):
                     th_var_val = KTH.eval(th_var)
                     tf_var_val = KTF.eval(tf_var)
 
-                    assert_allclose(th_mean_val, tf_mean_val, rtol=1e-4)
-                    assert_allclose(th_var_val, tf_var_val, rtol=1e-4)
+                    # absolute tolerance needed when working with zeros
+                    assert_allclose(th_mean_val, tf_mean_val, rtol=1e-4, atol=1e-10)
+                    assert_allclose(th_var_val, tf_var_val, rtol=1e-4, atol=1e-10)
+
+    def test_clip(self):
+        check_single_tensor_operation('clip', (4, 2), min_value=0.4, max_value=0.6)
+        check_single_tensor_operation('clip', (4, 2), min_value=0.4, max_value=None)
+
+        cases = [
+            # (x, min_value, max_value, expected)
+            (1, 0, 2, 1),
+            (1, 2, 0, 2),
+            (-1, 0, 2, 0),
+            (-1, 2, 0, 2),
+            (3, 0, 2, 2),
+            (3, 2, 0, 2),
+            (1, 0, np.inf, 1),
+            (1, np.inf, 0, np.inf),
+            (1, 0, -np.inf, 0),
+            (1, -np.inf, 0, 0),
+            (-1, 0, -np.inf, 0),
+            (-1, -np.inf, 0, -1),
+            (1, 0, None, 1),
+            (-1, 0, None, 0),
+
+            # NOTE: In the following two cases, Keras 2.0.8 raises an
+            # error on all backends, but this is a sensible extension.
+            (1, None, 0, 0),
+            (-1, None, 0, -1),
+
+            # NOTE: In the following case, Keras 2.0.8 raises an error
+            # for TensorFlow and Theano, but returns 0 for CNTK. This
+            # extends the TensorFlow and Theano backends to match the
+            # CNTK behavior instead of raising an error.
+            (0, None, None, 0),
+        ]
+        for K_, KC_ in [(KTF, KCTF), (KTH, KCTH)]:
+            for x, min_value, max_value, expected in cases:
+                actual = K_.eval(KC_.clip(K_.constant(x), min_value, max_value))
+                assert_allclose(expected, actual, atol=1e-5)
 
 
 if __name__ == '__main__':
diff --git a/tests/keras_contrib/callbacks/dead_relu_detector_test.py b/tests/keras_contrib/callbacks/dead_relu_detector_test.py
index 9a37df9..5f7c396 100644
--- a/tests/keras_contrib/callbacks/dead_relu_detector_test.py
+++ b/tests/keras_contrib/callbacks/dead_relu_detector_test.py
@@ -1,40 +1,191 @@
 import pytest
-import warnings
 import numpy as np
+import sys
+
+if (sys.version_info > (3, 0)):
+    from io import StringIO
+else:
+    from StringIO import StringIO
 
 from keras_contrib import callbacks
 from keras.models import Sequential
-from keras.layers import Dense
+from keras.layers import Dense, Conv2D, Flatten
+from keras import backend as K
+
+n_out = 11  # with 1 neuron dead, 1/11 is just below the threshold of 10% with verbose = False
+
+
+def check_print(do_train, expected_warnings, nr_dead=None, perc_dead=None):
+    """
+    Run `do_train` with stdout captured and check the dead-neuron warnings it prints
+    :param do_train: callable without arguments that trains a model
+    :param expected_warnings: int, expected number of "dead neurons" messages
+    :param nr_dead: int, dead-neuron count expected in the message
+    :param perc_dead: float, 10% should be written as 0.1
+    """
+
+    saved_stdout = sys.stdout
+    out = StringIO()
+    sys.stdout = out    # overwrite current stdout
+    try:
+        do_train()
+        # get prints, can be something like: "Layer dense (#0) has 2 dead neurons (20.00%)!"
+        stdoutput = out.getvalue().strip()
+    finally:
+        sys.stdout = saved_stdout   # restore stdout even if training raises
+        out.close()
+
+    count = stdoutput.count("dead neurons")
+
+    assert expected_warnings == count
+    if expected_warnings and (nr_dead is not None):
+        str_to_check = 'has {} dead'.format(nr_dead)
+        assert str_to_check in stdoutput, '"{}" not in "{}"'.format(str_to_check, stdoutput)
+    if expected_warnings and (perc_dead is not None):
+        str_to_check = 'neurons ({:.2%})!'.format(perc_dead)
+        assert str_to_check in stdoutput, '"{}" not in "{}"'.format(str_to_check, stdoutput)
 
 
 def test_DeadDeadReluDetector():
-    def do_test(weights, expected_warnings, verbose):
-        with warnings.catch_warnings(record=True) as w:
-            dataset = np.ones((1, 1, 1))    # data to be fed as training
+    n_samples = 9
+
+    input_shape = (n_samples, 3, 4)  # 4 input features
+    shape_out = (n_samples, 3, n_out)  # 11 output features
+    shape_weights = (4, n_out)
+
+    # ignore batch size
+    input_shape_dense = tuple(input_shape[1:])
+
+    def do_test(weights, expected_warnings, verbose, nr_dead=None, perc_dead=None):
+
+        def do_train():
+            dataset = np.ones(input_shape)    # data to be fed as training
             model = Sequential()
-            model.add(Dense(10, activation='relu', input_shape=(1, 1), use_bias=False, weights=[weights]))
+            model.add(Dense(n_out, activation='relu', input_shape=input_shape_dense,
+                            use_bias=False, weights=[weights], name='dense'))
             model.compile(optimizer='sgd', loss='categorical_crossentropy')
             model.fit(
                 dataset,
-                np.ones((1, 1, 10)),
+                np.ones(shape_out),
+                batch_size=1,
                 epochs=1,
                 callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)],
                 verbose=False
             )
-            assert len(w) == expected_warnings
-            for warn_item in w:
-                assert issubclass(warn_item.category, RuntimeWarning)
-                assert "dead neurons" in str(warn_item.message)
 
-    weights_1_dead = np.ones((1, 10))      # weights that correspond to NN with 1/10 neurons dead
+        check_print(do_train, expected_warnings, nr_dead, perc_dead)
+
+    weights_1_dead = np.ones(shape_weights)  # weights that correspond to NN with 1/11 neurons dead
+    weights_2_dead = np.ones(shape_weights)  # weights that correspond to NN with 2/11 neurons dead
+    weights_all_dead = np.zeros(shape_weights)  # weights that correspond to all neurons dead
+
     weights_1_dead[:, 0] = 0
-    weights_2_dead = np.ones((1, 10))      # weights that correspond to NN with 2/10 neurons dead
-    weights_2_dead[:, 0] = 0
-    weights_2_dead[:, 1] = 0
+    weights_2_dead[:, 0:2] = 0
 
-    do_test(weights_1_dead, verbose=True, expected_warnings=1)
+    do_test(weights_1_dead, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out)
     do_test(weights_1_dead, verbose=False, expected_warnings=0)
-    do_test(weights_2_dead, verbose=True, expected_warnings=1)
+    do_test(weights_2_dead, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out)
+    # do_test(weights_all_dead, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.)
+
+
+def test_DeadDeadReluDetector_bias():
+    n_samples = 9
+
+    input_shape = (n_samples, 4)  # 4 input features
+    shape_weights = (4, n_out)
+    shape_bias = (n_out, )
+    shape_out = (n_samples, n_out)  # 11 output features
+
+    # ignore batch size
+    input_shape_dense = tuple(input_shape[1:])
+
+    def do_test(weights, bias, expected_warnings, verbose, nr_dead=None, perc_dead=None):
+
+        def do_train():
+            dataset = np.ones(input_shape)  # data to be fed as training
+            model = Sequential()
+            model.add(Dense(n_out, activation='relu', input_shape=input_shape_dense,
+                            use_bias=True, weights=[weights, bias], name='dense'))
+            model.compile(optimizer='sgd', loss='categorical_crossentropy')
+            model.fit(
+                dataset,
+                np.ones(shape_out),
+                batch_size=1,
+                epochs=1,
+                callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)],
+                verbose=False
+            )
+
+        check_print(do_train, expected_warnings, nr_dead, perc_dead)
+
+    weights_1_dead = np.ones(shape_weights)  # weights that correspond to NN with 1/11 neurons dead
+    weights_2_dead = np.ones(shape_weights)  # weights that correspond to NN with 2/11 neurons dead
+    weights_all_dead = np.zeros(shape_weights)  # weights that correspond to all neurons dead
+
+    weights_1_dead[:, 0] = 0
+    weights_2_dead[:, 0:2] = 0
+
+    bias = np.zeros(shape_bias)
+
+    do_test(weights_1_dead, bias, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out)
+    do_test(weights_1_dead, bias, verbose=False, expected_warnings=0)
+    do_test(weights_2_dead, bias, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out)
+    # do_test(weights_all_dead, bias, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.)
+
+
+def test_DeadDeadReluDetector_conv():
+    n_samples = 9
+
+    # (5, 5) kernel, 4 input feature maps and 11 output feature maps
+    if K.image_data_format() == 'channels_last':
+        input_shape = (n_samples, 5, 5, 4)
+    else:
+        input_shape = (n_samples, 4, 5, 5)
+
+    # ignore batch size
+    input_shape_conv = tuple(input_shape[1:])
+    shape_weights = (5, 5, 4, n_out)
+    shape_out = (n_samples, n_out)
+
+    def do_test(weights_bias, expected_warnings, verbose, nr_dead=None, perc_dead=None):
+        """
+        :param perc_dead: as float, 10% should be written as 0.1
+        """
+
+        def do_train():
+            dataset = np.ones(input_shape)  # data to be fed as training
+            model = Sequential()
+            model.add(Conv2D(n_out, (5, 5), activation='relu', input_shape=input_shape_conv,
+                             use_bias=True, weights=weights_bias, name='conv'))
+            model.add(Flatten())  # to handle Theano's categorical crossentropy
+            model.compile(optimizer='sgd', loss='categorical_crossentropy')
+            model.fit(
+                dataset,
+                np.ones(shape_out),
+                batch_size=1,
+                epochs=1,
+                callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)],
+                verbose=False
+            )
+
+        check_print(do_train, expected_warnings, nr_dead, perc_dead)
+
+    weights_1_dead = np.ones(shape_weights)      # weights that correspond to NN with 1/11 neurons dead
+    weights_1_dead[..., 0] = 0
+    weights_2_dead = np.ones(shape_weights)    # weights that correspond to NN with 2/11 neurons dead
+    weights_2_dead[..., 0:2] = 0
+    weights_all_dead = np.zeros(shape_weights)    # weights that correspond to NN with all neurons dead
+
+    bias = np.zeros((11, ))
+
+    weights_bias_1_dead = [weights_1_dead, bias]
+    weights_bias_2_dead = [weights_2_dead, bias]
+    weights_bias_all_dead = [weights_all_dead, bias]
+
+    do_test(weights_bias_1_dead, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out)
+    do_test(weights_bias_1_dead, verbose=False, expected_warnings=0)
+    do_test(weights_bias_2_dead, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out)
+    # do_test(weights_bias_all_dead, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.)
 
 
 if __name__ == '__main__':
diff --git a/tests/keras_contrib/layers/test_advanced_activations.py b/tests/keras_contrib/layers/test_advanced_activations.py
index f07a690..8c71426 100644
--- a/tests/keras_contrib/layers/test_advanced_activations.py
+++ b/tests/keras_contrib/layers/test_advanced_activations.py
@@ -26,5 +26,18 @@ def test_srelu_share():
     layer_test(advanced_activations.SReLU, kwargs={'shared_axes': 1},
                input_shape=(2, 3, 4))
 
+
+@keras_test
+def test_swish_constant():
+    layer_test(advanced_activations.Swish, kwargs={'beta': 1.0, 'trainable': False},
+               input_shape=(2, 3, 4))
+
+
+@keras_test
+def test_swish_trainable():
+    layer_test(advanced_activations.Swish, kwargs={'beta': 1.0, 'trainable': True},
+               input_shape=(2, 3, 4))
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
diff --git a/tests/keras_contrib/layers/test_convolutional.py b/tests/keras_contrib/layers/test_convolutional.py
index 1760226..d207656 100644
--- a/tests/keras_contrib/layers/test_convolutional.py
+++ b/tests/keras_contrib/layers/test_convolutional.py
@@ -17,67 +17,6 @@ else:
     _convolution_border_modes = ['valid', 'same']
 
 
-@keras_test
-def test_deconvolution_3d():
-    num_samples = 6
-    num_filter = 4
-    stack_size = 2
-    kernel_dim1 = 12
-    kernel_dim2 = 10
-    kernel_dim3 = 8
-
-    for batch_size in [None, num_samples]:
-        for border_mode in _convolution_border_modes:
-            for subsample in [(1, 1, 1), (2, 2, 2)]:
-                if border_mode == 'same' and subsample != (1, 1, 1):
-                    continue
-
-                dim1 = conv_input_length(kernel_dim1, 7,
-                                         border_mode,
-                                         subsample[0])
-                dim2 = conv_input_length(kernel_dim2, 5,
-                                         border_mode,
-                                         subsample[1])
-                dim3 = conv_input_length(kernel_dim3, 3,
-                                         border_mode,
-                                         subsample[2])
-                layer_test(convolutional.Deconvolution3D,
-                           kwargs={'filters': num_filter,
-                                   'kernel_size': (7, 5, 3),
-                                   'output_shape': (batch_size, num_filter, dim1, dim2, dim3),
-                                   'padding': border_mode,
-                                   'strides': subsample,
-                                   'data_format': 'channels_first'},
-                           input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3),
-
-                           fixed_batch_size=True, tolerance=None)
-
-                layer_test(convolutional.Deconvolution3D,
-                           kwargs={'filters': num_filter,
-                                   'kernel_size': (7, 5, 3),
-                                   'output_shape': (batch_size, num_filter, dim1, dim2, dim3),
-                                   'padding': border_mode,
-                                   'strides': subsample,
-                                   'data_format': 'channels_first',
-                                   'kernel_regularizer': 'l2',
-                                   'bias_regularizer': 'l2',
-                                   'activity_regularizer': 'l2'},
-                           input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3),
-                           fixed_batch_size=True, tolerance=None)
-
-                layer_test(convolutional.Deconvolution3D,
-                           kwargs={'filters': num_filter,
-                                   'kernel_size': (7, 5, 3),
-                                   'output_shape': (num_filter, dim1, dim2, dim3),
-                                   'padding': border_mode,
-                                   'strides': subsample,
-                                   'data_format': 'channels_first',
-                                   'kernel_regularizer': 'l2',
-                                   'bias_regularizer': 'l2',
-                                   'activity_regularizer': 'l2'},
-                           input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3), tolerance=None)
-
-
 @keras_test
 def test_cosineconvolution_2d():
     num_samples = 2
diff --git a/tests/keras_contrib/layers/test_normalization.py b/tests/keras_contrib/layers/test_normalization.py
index 3321d09..1c74a04 100644
--- a/tests/keras_contrib/layers/test_normalization.py
+++ b/tests/keras_contrib/layers/test_normalization.py
@@ -25,9 +25,7 @@ def basic_instancenorm_test():
                input_shape=(3, 4, 2))
     layer_test(normalization.InstanceNormalization,
                kwargs={'gamma_initializer': 'ones',
-                       'beta_initializer': 'ones',
-                       'moving_mean_initializer': 'zeros',
-                       'moving_variance_initializer': 'ones'},
+                       'beta_initializer': 'ones'},
                input_shape=(3, 4, 2))
     layer_test(normalization.InstanceNormalization,
                kwargs={'scale': False, 'center': False},
@@ -190,7 +188,7 @@ def test_instancenorm_perchannel_correctness():
         for channel in range(3):
             activations = out[instance, channel]
             assert abs(activations.mean()) > 1e-2
-            assert abs(activations.std() - 1.0) > 1e-2
+            assert abs(activations.std() - 1.0) > 1e-6
 
         # but values are still normalized per-instance
         activations = out[instance]
@@ -229,10 +227,11 @@ def basic_batchrenorm_test():
 
 @keras_test
 def test_batchrenorm_mode_0_or_2():
-    for training in [1, 0]:
-        model = Sequential()
-        norm_m0 = normalization.BatchRenormalization(input_shape=(10,), momentum=0.8)
-        model.add(norm_m0)
+    for training in [1, 0, None]:
+        ip = Input(shape=(10,))
+        norm_m0 = normalization.BatchRenormalization(momentum=0.8)
+        out = norm_m0(ip, training=training)
+        model = Model(ip, out)
         model.compile(loss='mse', optimizer='sgd')
 
         # centered on 5.0, variance 10.0
@@ -306,5 +305,37 @@ def test_shared_batchrenorm():
     new_model.train_on_batch(x, x)
 
 
+@keras_test
+def test_batchrenorm_clipping_schedule():
+    '''Test that the clipping schedule isn't fixed at r_max=1, d_max=0'''
+    inp = Input(shape=(10,))
+    bn = normalization.BatchRenormalization(t_delta=1.)
+    out = bn(inp)
+    model = Model(inp, out)
+    model.compile('sgd', 'mse')
+
+    x = np.random.normal(5, 10, size=(2, 10))
+    y = np.random.normal(5, 10, size=(2, 10))
+
+    r_max, d_max = K.get_value(bn.r_max), K.get_value(bn.d_max)
+    assert r_max == 1
+    assert d_max == 0
+
+    for i in range(10):
+        model.train_on_batch(x, y)
+
+    r_max, d_max = K.get_value(bn.r_max), K.get_value(bn.d_max)
+    assert_allclose([r_max, d_max], [3, 5], atol=1e-1)
+
+
+@keras_test
+def test_batchrenorm_get_config():
+    '''Test that get_config works on a model with a batchrenorm layer.'''
+    x = Input(shape=(10,))
+    y = normalization.BatchRenormalization()(x)
+    model = Model(x, y)
+    model.get_config()
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
diff --git a/tests/keras_contrib/utils/save_load_utils_test.py b/tests/keras_contrib/utils/save_load_utils_test.py
index 67f55fc..a11e826 100644
--- a/tests/keras_contrib/utils/save_load_utils_test.py
+++ b/tests/keras_contrib/utils/save_load_utils_test.py
@@ -1,12 +1,16 @@
 import pytest
+import os
 from keras import backend as K
 from keras.layers import Input, Dense
 from keras.models import Model
 from numpy.testing import assert_allclose
+from keras.utils.test_utils import keras_test
 
 from keras_contrib.utils.save_load_utils import save_all_weights, load_all_weights
 
 
+@pytest.mark.skipif(K.backend() != 'tensorflow', reason='save_all_weights and load_all_weights only supported on TensorFlow')
+@keras_test
 def test_save_and_load_all_weights():
     '''
     Test save_all_weights and load_all_weights. Save and load optimizer and model weights but not configuration.
@@ -33,15 +37,16 @@ def test_save_and_load_all_weights():
     ow1value[0, 0:3] = [4, 2, 0]
     K.set_value(ow1, ow1value)
     # save all weights
-    save_all_weights(m1, "model.h5")
+    save_all_weights(m1, 'model.h5')
     # new model
     m2 = make_model()
     # load all weights
-    load_all_weights(m2, "model.h5")
+    load_all_weights(m2, 'model.h5')
     # check weights
     assert_allclose(K.get_value(m2.layers[1].kernel)[0, 0:4], [1, 3, 3, 7])
     # check optimizer weights
     assert_allclose(K.get_value(m2.optimizer.weights[3])[0, 0:3], [4, 2, 0])
+    os.remove('model.h5')
 
 
 if __name__ == '__main__':