From 7bae1db1f25d3b6c3066abe33f9433e18c58f5f7 Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Wed, 6 Sep 2017 13:01:50 -0500
Subject: [PATCH] Update DenseNet with ImageNet weights and major code cleanup

---
 keras_contrib/applications/densenet.py | 777 ++++++++++++++++++-------
 1 file changed, 552 insertions(+), 225 deletions(-)

diff --git a/keras_contrib/applications/densenet.py b/keras_contrib/applications/densenet.py
index a7c4711..2cf4020 100644
--- a/keras_contrib/applications/densenet.py
+++ b/keras_contrib/applications/densenet.py
@@ -1,5 +1,38 @@
 # -*- coding: utf-8 -*-
-'''DenseNet models for Keras.
+'''DenseNet and DenseNet-FCN models for Keras.
+
+DenseNet is a network architecture where each layer is directly connected
+to every other layer in a feed-forward fashion (within each dense block).
+For each layer, the feature maps of all preceding layers are treated as
+separate inputs whereas its own feature maps are passed on as inputs to
+all subsequent layers. This connectivity pattern yields state-of-the-art
+accuracies on CIFAR10/100 (with or without data augmentation) and SVHN.
+On the large scale ILSVRC 2012 (ImageNet) dataset, DenseNet achieves a
+similar accuracy as ResNet, but using less than half the amount of
+parameters and roughly half the number of FLOPs.
+
+DenseNets support any input image size of 32x32 or greater, and are thus
+suited for CIFAR-10 or CIFAR-100 datasets. There are two types of DenseNets,
+one suited for smaller images (DenseNet) and one suited for ImageNet,
+called DenseNetImageNet. They are differentiated by the strided convolution
+and pooling operations prior to the initial dense block.
+
+The following table describes the size and error rate of DenseNetImageNet models
+on the ImageNet dataset (single crop), for which weights are provided:
+------------------------------------------------------------------------------------
+|   Model type      | ImageNet Err (Top 1)  |  ImageNet Err (Top 5) |  Params (M)  |
+------------------------------------------------------------------------------------
+|   DenseNet-121    |    25.02 %            |        7.71 %         |     8.0      |
+|   DenseNet-169    |    23.80 %            |        6.85 %         |     14.3     |
+|   DenseNet-201    |    22.58 %            |        6.34 %         |     20.2     |
+|   DenseNet-161    |    22.20 %            |         -   %         |     28.9     |
+------------------------------------------------------------------------------------
+
+DenseNets can be extended to image segmentation tasks as described in the
+paper "The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for
+Semantic Segmentation". Here, the dense blocks are arranged and concatenated
+with long skip connections for state of the art performance on CamVid dataset.
+
 # Reference
 - [Densely Connected Convolutional Networks](https://arxiv.org/pdf/1608.06993.pdf)
 - [The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for Semantic Segmentation](https://arxiv.org/pdf/1611.09326.pdf)
@@ -11,89 +44,169 @@ from __future__ import division
 import warnings
 
 from keras.models import Model
-from keras.layers.core import Dense, Dropout, Activation, Reshape
-from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D
-from keras.layers.pooling import AveragePooling2D
-from keras.layers.pooling import GlobalAveragePooling2D
+from keras.layers import Dense
+from keras.layers import Dropout
+from keras.layers import Activation
+from keras.layers import Reshape
+from keras.layers import Conv2D
+from keras.layers import Conv2DTranspose
+from keras.layers import UpSampling2D
+from keras.layers import MaxPooling2D
+from keras.layers import AveragePooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.layers import GlobalAveragePooling2D
 from keras.layers import Input
-from keras.layers.merge import concatenate
-from keras.layers.normalization import BatchNormalization
+from keras.layers import concatenate
+from keras.layers import BatchNormalization
 from keras.regularizers import l2
 from keras.utils.layer_utils import convert_all_kernels_in_model
 from keras.utils.data_utils import get_file
 from keras.engine.topology import get_source_inputs
 from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.applications.imagenet_utils import decode_predictions
 import keras.backend as K
 
 from keras_contrib.layers.convolutional import SubPixelUpscaling
 
-TH_WEIGHTS_PATH = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering.h5'
-TF_WEIGHTS_PATH = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering.h5'
-TH_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering-no-top.h5'
-TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering-no-top.h5'
+DENSENET_121_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32.h5'
+DENSENET_161_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-161-48.h5'
+DENSENET_169_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-169-32.h5'
+DENSENET_121_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32-no-top.h5'
+DENSENET_161_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-161-48-no-top.h5'
+DENSENET_169_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-169-32-no-top.h5'
 
 
-def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=16, nb_layers_per_block=-1,
-             bottleneck=False, reduction=0.0, dropout_rate=0.0, weight_decay=1E-4,
-             include_top=True, weights='cifar10', input_tensor=None,
-             classes=10, activation='softmax'):
-    '''Instantiate the DenseNet architecture,
-        optionally loading weights pre-trained
-        on CIFAR-10. Note that when using TensorFlow,
-        for best performance you should set
-        `image_data_format='channels_last'` in your Keras config
-        at ~/.keras/keras.json.
-        The model and the weights are compatible with both
-        TensorFlow and Theano. The dimension ordering
-        convention used by the model is the one
-        specified in your Keras config file.
-        # Arguments
-            input_shape: optional shape tuple, only to be specified
-                if `include_top` is False (otherwise the input shape
-                has to be `(32, 32, 3)` (with `channels_last` dim ordering)
-                or `(3, 32, 32)` (with `channels_first` dim ordering).
-                It should have exactly 3 inputs channels,
-                and width and height should be no smaller than 8.
-                E.g. `(200, 200, 3)` would be one valid value.
-            depth: number or layers in the DenseNet
-            nb_dense_block: number of dense blocks to add to end (generally = 3)
-            growth_rate: number of filters to add per dense block
-            nb_filter: initial number of filters. -1 indicates initial
-                number of filters is 2 * growth_rate
-            nb_layers_per_block: number of layers in each dense block.
-                Can be a -1, positive integer or a list.
-                If -1, calculates nb_layer_per_block from the network depth.
-                If positive integer, a set number of layers per dense block.
-                If list, nb_layer is used as provided. Note that list size must
-                be (nb_dense_block + 1)
-            bottleneck: flag to add bottleneck blocks in between dense blocks
-            reduction: reduction factor of transition blocks.
-                Note : reduction value is inverted to compute compression.
-            dropout_rate: dropout rate
-            weight_decay: weight decay factor
-            include_top: whether to include the fully-connected
-                layer at the top of the network.
-            weights: one of `None` (random initialization) or
-                'cifar10' (pre-training on CIFAR-10)..
-            input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
-                to use as image input for the model.
-            classes: optional number of classes to classify images
-                into, only to be specified if `include_top` is True, and
-                if no `weights` argument is specified.
-            activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
-                Note that if sigmoid is used, classes must be 1.
-        # Returns
-            A Keras model instance.
-        '''
+def preprocess_input(x, data_format=None):
+    """Preprocesses a tensor encoding a batch of images.
 
-    if weights not in {'cifar10', None}:
+    # Arguments
+        x: input Numpy tensor, 4D.
+        data_format: data format of the image tensor.
+
+    # Returns
+        Preprocessed tensor.
+    """
+    if data_format is None:
+        data_format = K.image_data_format()
+    assert data_format in {'channels_last', 'channels_first'}
+
+    if data_format == 'channels_first':
+        if x.ndim == 3:
+            # 'RGB'->'BGR'
+            x = x[::-1, ...]
+            # Zero-center by mean pixel
+            x[0, :, :] -= 103.939
+            x[1, :, :] -= 116.779
+            x[2, :, :] -= 123.68
+        else:
+            x = x[:, ::-1, ...]
+            x[:, 0, :, :] -= 103.939
+            x[:, 1, :, :] -= 116.779
+            x[:, 2, :, :] -= 123.68
+    else:
+        # 'RGB'->'BGR'
+        x = x[..., ::-1]
+        # Zero-center by mean pixel
+        x[..., 0] -= 103.939
+        x[..., 1] -= 116.779
+        x[..., 2] -= 123.68
+
+    x *= 0.017  # scale values
+
+    return x
+
+
+def DenseNet(input_shape=None,
+             depth=40,
+             nb_dense_block=3,
+             growth_rate=12,
+             nb_filter=-1,
+             nb_layers_per_block=-1,
+             bottleneck=False,
+             reduction=0.0,
+             dropout_rate=0.0,
+             weight_decay=1e-4,
+             subsample_initial_block=False,
+             include_top=True,
+             weights=None,
+             input_tensor=None,
+             pooling=None,
+             classes=10,
+             activation='softmax'):
+    '''Instantiate the DenseNet architecture.
+
+    The model and the weights are compatible with both
+    TensorFlow and Theano. The dimension ordering
+    convention used by the model is the one
+    specified in your Keras config file.
+
+    # Arguments
+        input_shape: optional shape tuple, only to be specified
+            if `include_top` is False (otherwise the input shape
+            has to be `(224, 224, 3)` (with `channels_last` dim ordering)
+            or `(3, 224, 224)` (with `channels_first` dim ordering).
+            It should have exactly 3 input channels,
+            and width and height should be no smaller than 8.
+            E.g. `(224, 224, 3)` would be one valid value.
+        depth: number of layers in the DenseNet
+        nb_dense_block: number of dense blocks to add to end
+        growth_rate: number of filters to add per dense block
+        nb_filter: initial number of filters. -1 indicates initial
+            number of filters will default to 2 * growth_rate
+        nb_layers_per_block: number of layers in each dense block.
+            Can be a -1, positive integer or a list.
+            If -1, calculates nb_layer_per_block from the network depth.
+            If positive integer, a set number of layers per dense block.
+            If list, nb_layer is used as provided. Note that list size must
+            be nb_dense_block
+        bottleneck: flag to add bottleneck blocks in between dense blocks
+        reduction: reduction factor of transition blocks.
+            Note : reduction value is inverted to compute compression.
+        dropout_rate: dropout rate
+        weight_decay: weight decay rate
+        subsample_initial_block: Set to True to subsample the initial
+            convolution and add a MaxPool2D before the dense blocks are added.
+        include_top: whether to include the fully-connected
+            layer at the top of the network.
+        weights: one of `None` (random initialization) or
+            'imagenet' (pre-training on ImageNet).
+        input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+            to use as image input for the model.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
+        classes: optional number of classes to classify images
+            into, only to be specified if `include_top` is True, and
+            if no `weights` argument is specified.
+        activation: Type of activation at the top layer. Can be one of
+            'softmax' or 'sigmoid'. Note that if sigmoid is used,
+            classes must be 1.
+
+    # Returns
+        A Keras model instance.
+
+    # Raises
+        ValueError: in case of invalid argument for `weights`,
+            or invalid input shape.
+    '''
+
+    if weights not in {'imagenet', None}:
         raise ValueError('The `weights` argument should be either '
-                         '`None` (random initialization) or `cifar10` '
-                         '(pre-training on CIFAR-10).')
+                         '`None` (random initialization) or `imagenet` '
+                         '(pre-training on ImageNet).')
 
-    if weights == 'cifar10' and include_top and classes != 10:
-        raise ValueError('If using `weights` as CIFAR 10 with `include_top`'
-                         ' as true, `classes` should be 10')
+    if weights == 'imagenet' and include_top and classes != 1000:
+        raise ValueError('If using `weights` as ImageNet with `include_top` '
+                         'as true, `classes` should be 1000')
 
     if activation not in ['softmax', 'sigmoid']:
         raise ValueError('activation must be one of "softmax" or "sigmoid"')
@@ -106,7 +219,7 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi
                                       default_size=32,
                                       min_size=8,
                                       data_format=K.image_data_format(),
-                                      include_top=include_top)
+                                      require_flatten=include_top)
 
     if input_tensor is None:
         img_input = Input(shape=input_shape)
@@ -117,8 +230,9 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi
             img_input = input_tensor
 
     x = __create_dense_net(classes, img_input, include_top, depth, nb_dense_block,
-                           growth_rate, nb_filter, nb_layers_per_block, bottleneck, reduction,
-                           dropout_rate, weight_decay, activation)
+                           growth_rate, nb_filter, nb_layers_per_block, bottleneck,
+                           reduction, dropout_rate, weight_decay, subsample_initial_block,
+                           pooling, activation)
 
     # Ensure that the model takes into account
     # any potential predecessors of `input_tensor`.
@@ -130,47 +244,69 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi
     model = Model(inputs, x, name='densenet')
 
     # load weights
-    if weights == 'cifar10':
-        if (depth == 40) and (nb_dense_block == 3) and (growth_rate == 12) and (nb_filter == 16) and \
-                (bottleneck is False) and (reduction == 0.0) and (dropout_rate == 0.0) and (weight_decay == 1E-4):
-            # Default parameters match. Weights for this model exist:
+    if weights == 'imagenet':
+        weights_loaded = False
 
-            if K.image_data_format() == 'channels_first':
-                if include_top:
-                    weights_path = get_file('densenet_40_12_th_dim_ordering_th_kernels.h5',
-                                            TH_WEIGHTS_PATH,
-                                            cache_subdir='models')
-                else:
-                    weights_path = get_file('densenet_40_12_th_dim_ordering_th_kernels_no_top.h5',
-                                            TH_WEIGHTS_PATH_NO_TOP,
-                                            cache_subdir='models')
-
-                model.load_weights(weights_path)
-
-                if K.backend() == 'tensorflow':
-                    warnings.warn('You are using the TensorFlow backend, yet you '
-                                  'are using the Theano '
-                                  'image dimension ordering convention '
-                                  '(`image_data_format="channels_first"`). '
-                                  'For best performance, set '
-                                  '`image_data_format="channels_last"` in '
-                                  'your Keras config '
-                                  'at ~/.keras/keras.json.')
-                    convert_all_kernels_in_model(model)
+        if (depth == 121) and (nb_dense_block == 4) and (growth_rate == 32) and (nb_filter == 64) and \
+                (bottleneck is True) and (reduction == 0.5) and (dropout_rate == 0.0) and (subsample_initial_block):
+            if include_top:
+                weights_path = get_file('DenseNet-BC-121-32.h5',
+                                        DENSENET_121_WEIGHTS_PATH,
+                                        cache_subdir='models',
+                                        md5_hash='a439dd41aa672aef6daba4ee1fd54abd')
             else:
-                if include_top:
-                    weights_path = get_file('densenet_40_12_tf_dim_ordering_tf_kernels.h5',
-                                            TF_WEIGHTS_PATH,
-                                            cache_subdir='models')
-                else:
-                    weights_path = get_file('densenet_40_12_tf_dim_ordering_tf_kernels_no_top.h5',
-                                            TF_WEIGHTS_PATH_NO_TOP,
-                                            cache_subdir='models')
+                weights_path = get_file('DenseNet-BC-121-32-no-top.h5',
+                                        DENSENET_121_WEIGHTS_PATH_NO_TOP,
+                                        cache_subdir='models',
+                                        md5_hash='8804bcb37da5be4a52dc4e45d4425ba7')
+            model.load_weights(weights_path)
+            weights_loaded = True
 
-                model.load_weights(weights_path)
+        if (depth == 161) and (nb_dense_block == 4) and (growth_rate == 48) and (nb_filter == 96) and \
+                (bottleneck is True) and (reduction == 0.5) and (dropout_rate == 0.0) and (subsample_initial_block):
+            if include_top:
+                weights_path = get_file('DenseNet-BC-161-48.h5',
+                                        DENSENET_161_WEIGHTS_PATH,
+                                        cache_subdir='models',
+                                        md5_hash='6c326cf4fbdb57d31eff04333a23fcca')
+            else:
+                weights_path = get_file('DenseNet-BC-161-48-no-top.h5',
+                                        DENSENET_161_WEIGHTS_PATH_NO_TOP,
+                                        cache_subdir='models',
+                                        md5_hash='d38903b8732fe238c91dac7859271f26')
+            model.load_weights(weights_path)
+            weights_loaded = True
 
-                if K.backend() == 'theano':
-                    convert_all_kernels_in_model(model)
+        if (depth == 169) and (nb_dense_block == 4) and (growth_rate == 32) and (nb_filter == 64) and \
+                (bottleneck is True) and (reduction == 0.5) and (dropout_rate == 0.0) and (subsample_initial_block):
+            if include_top:
+                weights_path = get_file('DenseNet-BC-169-32.h5',
+                                        DENSENET_169_WEIGHTS_PATH,
+                                        cache_subdir='models',
+                                        md5_hash='914869c361303d2e39dec640b4e606a6')
+            else:
+                weights_path = get_file('DenseNet-BC-169-32-no-top.h5',
+                                        DENSENET_169_WEIGHTS_PATH_NO_TOP,
+                                        cache_subdir='models',
+                                        md5_hash='a664d78a30ddd217dd38c0bb8d258461')
+            model.load_weights(weights_path)
+            weights_loaded = True
+
+        if weights_loaded:
+            if K.backend() == 'theano':
+                convert_all_kernels_in_model(model)
+
+            if K.image_data_format() == 'channels_first' and K.backend() == 'tensorflow':
+                warnings.warn('You are using the TensorFlow backend, yet you '
+                              'are using the Theano '
+                              'image data format convention '
+                              '(`image_data_format="channels_first"`). '
+                              'For best performance, set '
+                              '`image_data_format="channels_last"` in '
+                              'your Keras config '
+                              'at ~/.keras/keras.json.')
+
+            print("Weights for the model were loaded successfully")
 
     return model
 
@@ -297,95 +433,182 @@ def DenseNetFCN(input_shape, nb_dense_block=5, growth_rate=16, nb_layers_per_blo
     return model
 
 
-def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1E-4):
-    ''' Apply BatchNorm, Relu, 3x3 Conv2D, optional bottleneck block and dropout
-    Args:
-        ip: Input keras tensor
-        nb_filter: number of filters
-        bottleneck: add bottleneck block
+def DenseNetImageNet121(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=121, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 24, 16], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet169(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=169, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 32, 32], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet201(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights=None,
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=201, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 48, 32], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet264(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights=None,
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=264, nb_dense_block=4, growth_rate=32, nb_filter=64,
+                    nb_layers_per_block=[6, 12, 64, 48], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def DenseNetImageNet161(input_shape=None,
+                        bottleneck=True,
+                        reduction=0.5,
+                        dropout_rate=0.0,
+                        weight_decay=1e-4,
+                        include_top=True,
+                        weights='imagenet',
+                        input_tensor=None,
+                        pooling=None,
+                        classes=1000,
+                        activation='softmax'):
+    return DenseNet(input_shape, depth=161, nb_dense_block=4, growth_rate=48, nb_filter=96,
+                    nb_layers_per_block=[6, 12, 36, 24], bottleneck=bottleneck, reduction=reduction,
+                    dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True,
+                    include_top=include_top, weights=weights, input_tensor=input_tensor,
+                    pooling=pooling, classes=classes, activation=activation)
+
+
+def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1e-4):
+    '''
+    Adds a convolution layer (with batch normalization and relu),
+    and optionally a bottleneck layer.
+
+    # Arguments
+        ip: Input tensor
+        nb_filter: integer, the dimensionality of the output space
+            (i.e. the number of output filters in the convolution)
+        bottleneck: if True, adds a bottleneck convolution block
         dropout_rate: dropout rate
         weight_decay: weight decay factor
-    Returns: keras tensor with batch_norm, relu and convolution2d added (optional bottleneck)
-    '''
 
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(samples, filters, new_rows, new_cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, new_rows, new_cols, filters)` if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to stride.
+
+    # Returns
+        output tensor of block
+    '''
     concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
 
-    x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                           beta_regularizer=l2(weight_decay))(ip)
+    x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(ip)
     x = Activation('relu')(x)
 
     if bottleneck:
-        inter_channel = nb_filter * 4  # Obtained from https://github.com/liuzhuang13/DenseNet/blob/master/densenet.lua
+        inter_channel = nb_filter * 4
 
-        x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False,
+        x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_normal', padding='same', use_bias=False,
                    kernel_regularizer=l2(weight_decay))(x)
-
-        if dropout_rate:
-            x = Dropout(dropout_rate)(x)
-
-        x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                               beta_regularizer=l2(weight_decay))(x)
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x)
         x = Activation('relu')(x)
 
-    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', use_bias=False,
-               kernel_regularizer=l2(weight_decay))(x)
+    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_normal', padding='same', use_bias=False)(x)
     if dropout_rate:
         x = Dropout(dropout_rate)(x)
 
     return x
 
 
-def __transition_block(ip, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4):
-    ''' Apply BatchNorm, Relu 1x1, Conv2D, optional compression, dropout and Maxpooling2D
-    Args:
-        ip: keras tensor
-        nb_filter: number of filters
-        compression: calculated as 1 - reduction. Reduces the number of feature maps
-                    in the transition block.
+def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropout_rate=None,
+                  weight_decay=1e-4, grow_nb_filters=True, return_concat_list=False):
+    '''
+    Build a dense_block where the output of each conv_block is fed
+    to subsequent ones
+
+    # Arguments
+        x: input keras tensor
+        nb_layers: the number of conv_blocks to append to the model
+        nb_filter: integer, the dimensionality of the output space
+            (i.e. the number of output filters in the convolution)
+        growth_rate: growth rate of the dense block
+        bottleneck: if True, adds a bottleneck convolution block to
+            each conv_block
         dropout_rate: dropout rate
         weight_decay: weight decay factor
-    Returns: keras tensor, after applying batch_norm, relu-conv, dropout, maxpool
+        grow_nb_filters: if True, allows number of filters to grow
+        return_concat_list: set to True to return the list of
+            feature maps along with the actual output
+
+    # Returns
+        If return_concat_list is True, returns a list of the output
+        keras tensor, the number of filters and a list of all the
+        dense blocks added to the keras tensor
+
+        If return_concat_list is False, returns a list of the output
+        keras tensor and the number of filters
     '''
-
-    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
-
-    x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                           beta_regularizer=l2(weight_decay))(ip)
-    x = Activation('relu')(x)
-    x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False,
-               kernel_regularizer=l2(weight_decay))(x)
-    if dropout_rate:
-        x = Dropout(dropout_rate)(x)
-    x = AveragePooling2D((2, 2), strides=(2, 2))(x)
-
-    return x
-
-
-def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropout_rate=None, weight_decay=1E-4,
-                  grow_nb_filters=True, return_concat_list=False):
-    ''' Build a dense_block where the output of each conv_block is fed to subsequent ones
-    Args:
-        x: keras tensor
-        nb_layers: the number of layers of conv_block to append to the model.
-        nb_filter: number of filters
-        growth_rate: growth rate
-        bottleneck: bottleneck block
-        dropout_rate: dropout rate
-        weight_decay: weight decay factor
-        grow_nb_filters: flag to decide to allow number of filters to grow
-        return_concat_list: return the list of feature maps along with the actual output
-    Returns: keras tensor with nb_layers of conv_block appended
-    '''
-
     concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
 
     x_list = [x]
 
     for i in range(nb_layers):
-        conv_block = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay)
-        x_list.append(conv_block)
+        cb = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay)
+        x_list.append(cb)
 
-        x = concatenate(x_list, axis=concat_axis)
+        x = concatenate([x, cb], axis=concat_axis)
 
         if grow_nb_filters:
             nb_filter += growth_rate
@@ -396,36 +619,96 @@ def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropou
         return x, nb_filter
 
 
-def __transition_up_block(ip, nb_filters, type='upsampling', weight_decay=1E-4):
-    ''' SubpixelConvolutional Upscaling (factor = 2)
-    Args:
-        ip: keras tensor
-        nb_filters: number of layers
-        type: can be 'upsampling', 'subpixel', 'deconv'. Determines type of upsampling performed
+def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4):
+    '''
+    Adds a pointwise convolution layer (with batch normalization and relu),
+    and an average pooling layer. The number of output convolution filters
+    can be reduced by appropriately reducing the compression parameter.
+
+    # Arguments
+        ip: input keras tensor
+        nb_filter: integer, the number of filters before compression is applied
+            (the convolution outputs `int(nb_filter * compression)` filters)
+        compression: calculated as 1 - reduction. Reduces the number
+            of feature maps in the transition block.
         weight_decay: weight decay factor
-    Returns: keras tensor, after applying upsampling operation.
+
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(samples, nb_filter * compression, rows / 2, cols / 2)`
+        if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows / 2, cols / 2, nb_filter * compression)`
+        if data_format='channels_last'.
+
+    # Returns
+        a keras tensor
+    '''
+    concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
+
+    x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(ip)
+    x = Activation('relu')(x)
+    x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_normal', padding='same',
+               use_bias=False, kernel_regularizer=l2(weight_decay))(x)
+    x = AveragePooling2D((2, 2), strides=(2, 2))(x)
+
+    return x
+
+
+def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4):
+    '''Adds an upsampling block. Upsampling operation relies on the type parameter.
+
+    # Arguments
+        ip: input keras tensor
+        nb_filters: integer, the dimensionality of the output space
+            (i.e. the number of output filters in the convolution)
+        type: can be 'upsampling', 'subpixel', 'deconv'. Determines
+            type of upsampling performed
+        weight_decay: weight decay factor
+
+    # Input shape
+        4D tensor with shape:
+        `(samples, channels, rows, cols)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows, cols, channels)` if data_format='channels_last'.
+
+    # Output shape
+        4D tensor with shape:
+        `(samples, nb_filters, rows * 2, cols * 2)` if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, rows * 2, cols * 2, nb_filters)` if data_format='channels_last'.
+
+    # Returns
+        a keras tensor
     '''
 
     if type == 'upsampling':
         x = UpSampling2D()(ip)
     elif type == 'subpixel':
-        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay),
-                   use_bias=False, kernel_initializer='he_uniform')(ip)
+        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay),
+                   use_bias=False, kernel_initializer='he_normal')(ip)
         x = SubPixelUpscaling(scale_factor=2)(x)
-        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay),
-                   use_bias=False, kernel_initializer='he_uniform')(x)
+        x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay),
+                   use_bias=False, kernel_initializer='he_normal')(x)
     else:
         x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same', strides=(2, 2),
-                            kernel_initializer='he_uniform')(ip)
+                            kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(ip)
 
     return x
 
 
 def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=-1,
-                       nb_layers_per_block=-1, bottleneck=False, reduction=0.0, dropout_rate=None, weight_decay=1E-4,
-                       activation='softmax'):
+                       nb_layers_per_block=-1, bottleneck=False, reduction=0.0, dropout_rate=None, weight_decay=1e-4,
+                       subsample_initial_block=False, pooling=None, activation='softmax'):
     ''' Build the DenseNet model
-    Args:
+
+    # Arguments
         nb_classes: number of classes
         img_input: tuple of shape (channels, rows, columns) or (rows, columns, channels)
         include_top: flag to include the final Dense layer
@@ -442,28 +725,51 @@ def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_bl
         bottleneck: add bottleneck blocks
         reduction: reduction factor of transition blocks. Note : reduction value is inverted to compute compression
         dropout_rate: dropout rate
-        weight_decay: weight decay
+        weight_decay: weight decay rate
+        subsample_initial_block: Set to True to subsample the initial convolution and
+                add a MaxPooling2D layer before the dense blocks are added.
+        pooling: Optional pooling mode for feature extraction
+            when `include_top` is `False`.
+            - `None` means that the output of the model
+                will be the 4D tensor output of the
+                last convolutional layer.
+            - `avg` means that global average pooling
+                will be applied to the output of the
+                last convolutional layer, and thus
+                the output of the model will be a
+                2D tensor.
+            - `max` means that global max pooling will
+                be applied.
         activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
                 Note that if sigmoid is used, classes must be 1.
-    Returns: keras tensor with nb_layers of conv_block appended
+
+    # Returns
+        a keras tensor
+
+    # Raises
+        ValueError: in case of invalid argument for `reduction`
+            or `nb_dense_block`
     '''
 
     concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
 
-    assert (depth - 4) % 3 == 0, 'Depth must be 3 N + 4'
     if reduction != 0.0:
-        assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0'
+        if not (reduction <= 1.0 and reduction > 0.0):
+            raise ValueError('`reduction` value must lie between 0.0 and 1.0')
 
     # layers in each dense block
     if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
         nb_layers = list(nb_layers_per_block)  # Convert tuple to list
 
-        assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. ' \
-                                                       'Note that list size must be (nb_dense_block + 1)'
+        if len(nb_layers) != (nb_dense_block):
+            raise ValueError('If `nb_dense_block` is a list, its length must match '
+                             'the number of layers provided by `nb_layers`.')
+
         final_nb_layer = nb_layers[-1]
         nb_layers = nb_layers[:-1]
     else:
         if nb_layers_per_block == -1:
+            assert (depth - 4) % 3 == 0, 'Depth must be 3 N + 4 if nb_layers_per_block == -1'
             count = int((depth - 4) / 3)
             nb_layers = [count for _ in range(nb_dense_block)]
             final_nb_layer = count
@@ -471,9 +777,6 @@ def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_bl
             final_nb_layer = nb_layers_per_block
             nb_layers = [nb_layers_per_block] * nb_dense_block
 
-    if bottleneck:
-        nb_layers = [int(layer // 2) for layer in nb_layers]
-
     # compute initial nb_filter if -1, else accept users initial nb_filter
     if nb_filter <= 0:
         nb_filter = 2 * growth_rate
@@ -482,39 +785,55 @@ def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_bl
     compression = 1.0 - reduction
 
     # Initial convolution
-    x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D',
-               use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+    if subsample_initial_block:
+        initial_kernel = (7, 7)
+        initial_strides = (2, 2)
+    else:
+        initial_kernel = (3, 3)
+        initial_strides = (1, 1)
+
+    x = Conv2D(nb_filter, initial_kernel, kernel_initializer='he_normal', padding='same',
+               strides=initial_strides, use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+
+    if subsample_initial_block:
+        x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x)
+        x = Activation('relu')(x)
+        x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
 
     # Add dense blocks
     for block_idx in range(nb_dense_block - 1):
         x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, bottleneck=bottleneck,
                                      dropout_rate=dropout_rate, weight_decay=weight_decay)
         # add transition_block
-        x = __transition_block(x, nb_filter, compression=compression, dropout_rate=dropout_rate,
-                               weight_decay=weight_decay)
+        x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay)
         nb_filter = int(nb_filter * compression)
 
     # The last dense_block does not have a transition_block
     x, nb_filter = __dense_block(x, final_nb_layer, nb_filter, growth_rate, bottleneck=bottleneck,
                                  dropout_rate=dropout_rate, weight_decay=weight_decay)
 
-    x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay),
-                           beta_regularizer=l2(weight_decay))(x)
+    x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x)
     x = Activation('relu')(x)
-    x = GlobalAveragePooling2D()(x)
 
     if include_top:
-        x = Dense(nb_classes, activation=activation, W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay))(x)
+        x = GlobalAveragePooling2D()(x)
+        x = Dense(nb_classes, activation=activation)(x)
+    else:
+        if pooling == 'avg':
+            x = GlobalAveragePooling2D()(x)
+        if pooling == 'max':
+            x = GlobalMaxPooling2D()(x)
 
     return x
 
 
 def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, growth_rate=12,
-                           reduction=0.0, dropout_rate=None, weight_decay=1E-4,
-                           nb_layers_per_block=4, nb_upsampling_conv=128, upsampling_type='deconv',
-                           init_conv_filters=48, input_shape=None, activation='softmax'):
-    ''' Build the DenseNet model
-    Args:
+                           reduction=0.0, dropout_rate=None, weight_decay=1e-4,
+                           nb_layers_per_block=4, nb_upsampling_conv=128, upsampling_type='upsampling',
+                           init_conv_filters=48, input_shape=None, activation='deconv'):
+    ''' Build the DenseNet-FCN model
+
+    # Arguments
         nb_classes: number of classes
         img_input: tuple of shape (channels, rows, columns) or (rows, columns, channels)
         include_top: flag to include the final Dense layer
@@ -534,7 +853,13 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
         input_shape: Only used for shape inference in fully convolutional networks.
         activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'.
                     Note that if sigmoid is used, classes must be 1.
-    Returns: keras tensor with nb_layers of conv_block appended
+
+    # Returns
+        a keras tensor
+
+    # Raises
+        ValueError: in case of invalid argument for `reduction`,
+            `nb_dense_block` or `nb_upsampling_conv`.
     '''
 
     concat_axis = 1 if K.image_data_format() == 'channels_first' else -1
@@ -545,20 +870,22 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
         rows, cols, _ = input_shape
 
     if reduction != 0.0:
-        assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0'
+        if not (reduction <= 1.0 and reduction > 0.0):
+            raise ValueError('`reduction` value must lie between 0.0 and 1.0')
 
     # check if upsampling_conv has minimum number of filters
     # minimum is set to 12, as at least 3 color channels are needed for correct upsampling
-    assert nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0, 'Parameter `upsampling_conv` number of channels must ' \
-                                                                    'be a positive number divisible by 4 and greater ' \
-                                                                    'than 12'
+    if not (nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0):
+        raise ValueError('Parameter `nb_upsampling_conv` number of channels must '
+                         'be a positive number divisible by 4 and greater than 12')
 
     # layers in each dense block
     if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple:
         nb_layers = list(nb_layers_per_block)  # Convert tuple to list
 
-        assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. ' \
-                                                       'Note that list size must be (nb_dense_block + 1)'
+        if len(nb_layers) != (nb_dense_block + 1):
+            raise ValueError('If `nb_dense_block` is a list, its length must be '
+                             '(`nb_dense_block` + 1)')
 
         bottleneck_nb_layers = nb_layers[-1]
         rev_layers = nb_layers[::-1]
@@ -571,8 +898,10 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
     compression = 1.0 - reduction
 
     # Initial convolution
-    x = Conv2D(init_conv_filters, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D',
+    x = Conv2D(init_conv_filters, (7, 7), kernel_initializer='he_normal', padding='same', name='initial_conv2D',
                use_bias=False, kernel_regularizer=l2(weight_decay))(img_input)
+    x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x)
+    x = Activation('relu')(x)
 
     nb_filter = init_conv_filters
 
@@ -580,15 +909,14 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
 
     # Add dense blocks and transition down block
     for block_idx in range(nb_dense_block):
-        x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate,
-                                     dropout_rate=dropout_rate, weight_decay=weight_decay)
+        x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, dropout_rate=dropout_rate,
+                                     weight_decay=weight_decay)
 
         # Skip connection
         skip_list.append(x)
 
         # add transition_block
-        x = __transition_block(x, nb_filter, compression=compression, dropout_rate=dropout_rate,
-                               weight_decay=weight_decay)
+        x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay)
 
         nb_filter = int(nb_filter * compression)  # this is calculated inside transition_down_block
 
@@ -608,7 +936,7 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
         # not the concatenation of the input with the feature maps (concat_list[0].
         l = concatenate(concat_list[1:], axis=concat_axis)
 
-        t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type)
+        t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type, weight_decay=weight_decay)
 
         # concatenate the skip connection with the transition block
         x = concatenate([t, skip_list[block_idx]], axis=concat_axis)
@@ -616,12 +944,11 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5,
         # Dont allow the feature map size to grow in upsampling dense blocks
         x_up, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1], nb_filter=growth_rate,
                                                      growth_rate=growth_rate, dropout_rate=dropout_rate,
-                                                     weight_decay=weight_decay,
-                                                     return_concat_list=True, grow_nb_filters=False)
+                                                     weight_decay=weight_decay, return_concat_list=True,
+                                                     grow_nb_filters=False)
 
     if include_top:
-        x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', kernel_regularizer=l2(weight_decay),
-                   use_bias=False)(x_up)
+        x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', use_bias=False)(x_up)
 
         if K.image_data_format() == 'channels_first':
             channel, row, col = input_shape