diff --git a/.travis.yml b/.travis.yml index 3edc3ee..be93c59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,25 +32,39 @@ install: - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py - source activate test-environment - - pip install pytest-cov python-coveralls pytest-xdist coverage==3.7.1 #we need this version of coverage for coveralls.io to work + - pip install pytest-cov pytest-xdist - pip install pep8 pytest-pep8 + - conda install mkl mkl-service - pip install theano - pip install git+git://github.com/fchollet/keras.git # install PIL for preprocessing tests - #- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - # conda install pil; - # elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then - # conda install Pillow; - # fi + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + conda install pil; + elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then + conda install Pillow; + fi - - python setup.py install + - pip install -e .[tests] - # install TensorFlow (CPU) + # install TensorFlow (CPU version). 
- pip install tensorflow + + # install cntk + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.2-cp27-cp27mu-linux_x86_64.whl; + elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then + pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.2-cp35-cp35m-linux_x86_64.whl; + fi + + # install pydot for visualization tests + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + conda install pydot graphviz; + fi # command to run tests script: + - export MKL_THREADING_LAYER="GNU" # run keras backend init to initialize backend config - python -c "import keras.backend" # create dataset directory to avoid concurrent directory creation at runtime @@ -61,7 +75,5 @@ script: - if [[ "$TEST_MODE" == "PEP8" ]]; then PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0; else - PYTHONPATH=$PWD:$PYTHONPATH py.test tests/; + PYTHONPATH=$PWD:$PYTHONPATH py.test tests/ --ignore=tests/integration_tests --ignore=tests/test_documentation.py --cov=keras tests/ --cov-report term-missing; fi -after_success: - - coveralls diff --git a/GUIDELINES.md b/GUIDELINES.md index bb95dcd..df0bed9 100644 --- a/GUIDELINES.md +++ b/GUIDELINES.md @@ -2,6 +2,7 @@ ## Maintainers: Following are the users with write-access to this repository (maintainers) : +* [athundt](https://www.github.com/athundt) * [bstriner](https://www.github.com/bstriner) * [farizrahman4u](https://www.github.com/farizrahman4u) * [fchollet](https://www.github.com/fchollet) diff --git a/examples/cifar10_densenet.py b/examples/cifar10_densenet.py index 79c6c3d..84e2a33 100644 --- a/examples/cifar10_densenet.py +++ b/examples/cifar10_densenet.py @@ -33,8 +33,11 @@ nb_filter = 16 dropout_rate = 0.0 # 0.0 for data augmentation # Create the model (without loading weights) -model = DenseNet(depth, nb_dense_block, growth_rate, nb_filter, dropout_rate=dropout_rate, - input_shape=img_dim, weights=None) +model = DenseNet(depth=depth, nb_dense_block=nb_dense_block, + 
growth_rate=growth_rate, nb_filter=nb_filter, + dropout_rate=dropout_rate, + input_shape=img_dim, + weights=None) print('Model created') model.summary() diff --git a/examples/cifar10_nasnet.py b/examples/cifar10_nasnet.py new file mode 100644 index 0000000..56c75ee --- /dev/null +++ b/examples/cifar10_nasnet.py @@ -0,0 +1,106 @@ +""" +Adapted from keras example cifar10_cnn.py +Train NASNet-CIFAR on the CIFAR10 small images dataset. +""" +from __future__ import print_function +from keras.datasets import cifar10 +from keras.preprocessing.image import ImageDataGenerator +from keras.utils import np_utils +from keras.callbacks import ModelCheckpoint +from keras.callbacks import ReduceLROnPlateau +from keras.callbacks import CSVLogger +from keras.optimizers import Adam +from keras_contrib.applications.nasnet import NASNetCIFAR, preprocess_input + +import numpy as np + + +weights_file = 'NASNet-CIFAR-10.h5' +lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.5), cooldown=0, patience=5, min_lr=0.5e-5) +csv_logger = CSVLogger('NASNet-CIFAR-10.csv') +model_checkpoint = ModelCheckpoint(weights_file, monitor='val_predictions_acc', save_best_only=True, + save_weights_only=True, mode='max') + +batch_size = 128 +nb_classes = 10 +nb_epoch = 600 +data_augmentation = True + +# input image dimensions +img_rows, img_cols = 32, 32 +# The CIFAR10 images are RGB. +img_channels = 3 + +# The data, shuffled and split between train and test sets: +(X_train, y_train), (X_test, y_test) = cifar10.load_data() + +# Convert class vectors to binary class matrices. 
+Y_train = np_utils.to_categorical(y_train, nb_classes) +Y_test = np_utils.to_categorical(y_test, nb_classes) + +X_train = X_train.astype('float32') +X_test = X_test.astype('float32') + +# preprocess input +X_train = preprocess_input(X_train) +X_test = preprocess_input(X_test) + +# For training, the auxiliary branch must be used to correctly train NASNet +model = NASNetCIFAR((img_rows, img_cols, img_channels), use_auxilary_branch=True) +model.summary() + +optimizer = Adam(lr=1e-3, clipnorm=5) +model.compile(loss=['categorical_crossentropy', 'categorical_crossentropy'], + optimizer=optimizer, metrics=['accuracy'], loss_weights=[1.0, 0.4]) + +# model.load_weights('NASNet-CIFAR-10.h5', by_name=True) + +if not data_augmentation: + print('Not using data augmentation.') + model.fit(X_train, [Y_train, Y_train], + batch_size=batch_size, + epochs=nb_epoch, + validation_data=(X_test, [Y_test, Y_test]), + shuffle=True, + verbose=2, + callbacks=[lr_reducer, csv_logger, model_checkpoint]) +else: + print('Using real-time data augmentation.') + # This will do preprocessing and realtime data augmentation: + datagen = ImageDataGenerator( + featurewise_center=False, # set input mean to 0 over the dataset + samplewise_center=False, # set each sample mean to 0 + featurewise_std_normalization=False, # divide inputs by std of the dataset + samplewise_std_normalization=False, # divide each input by its std + zca_whitening=False, # apply ZCA whitening + rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) + width_shift_range=0.1, # randomly shift images horizontally (fraction of total width) + height_shift_range=0.1, # randomly shift images vertically (fraction of total height) + horizontal_flip=True, # randomly flip images + vertical_flip=False) # randomly flip images + + # Compute quantities required for featurewise normalization + # (std, mean, and principal components if ZCA whitening is applied).
+ datagen.fit(X_train) + + # wrap the ImageDataGenerator to yield two label batches [y, y] for each input batch X + # When training a NASNet model, we have to use its auxiliary training head + # Therefore the model is technically a 1 input - 2 output model, and requires + # the label to be duplicated for the auxiliary head + def image_data_generator_wrapper(image_datagenerator, batch_size): + iterator = datagen.flow(X_train, Y_train, batch_size=batch_size) + + while True: + X, y = next(iterator) # get the next batch + yield X, [y, y] # duplicate the labels for each batch + + # Fit the model on the batches generated by datagen.flow(). + model.fit_generator(image_data_generator_wrapper(datagen, batch_size), + steps_per_epoch=X_train.shape[0] // batch_size, + validation_data=(X_test, [Y_test, Y_test]), + epochs=nb_epoch, verbose=2, + callbacks=[lr_reducer, csv_logger, model_checkpoint]) + +scores = model.evaluate(X_test, [Y_test, Y_test], batch_size=batch_size) +for score, metric_name in zip(scores, model.metrics_names): + print("%s : %0.4f" % (metric_name, score)) diff --git a/examples/cifar10_resnet.py b/examples/cifar10_resnet.py new file mode 100644 index 0000000..edb6384 --- /dev/null +++ b/examples/cifar10_resnet.py @@ -0,0 +1,96 @@ +""" +Adapted from keras example cifar10_cnn.py and github.com/raghakot/keras-resnet +Train ResNet-18 on the CIFAR10 small images dataset.
+ +GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): + THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python cifar10_resnet.py +""" +from __future__ import print_function +from keras.datasets import cifar10 +from keras.preprocessing.image import ImageDataGenerator +from keras.utils import np_utils +from keras.callbacks import ModelCheckpoint +from keras.callbacks import ReduceLROnPlateau +from keras.callbacks import CSVLogger +from keras.callbacks import EarlyStopping +from keras_contrib.applications.resnet import ResNet18 + +import numpy as np + + +weights_file = 'ResNet18v2-CIFAR-10.h5' +lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=5, min_lr=0.5e-6) +early_stopper = EarlyStopping(min_delta=0.001, patience=10) +csv_logger = CSVLogger('ResNet18v2-CIFAR-10.csv') +model_checkpoint = ModelCheckpoint(weights_file, monitor='val_acc', save_best_only=True, + save_weights_only=True, mode='auto') + +batch_size = 32 +nb_classes = 10 +nb_epoch = 200 +data_augmentation = True + +# input image dimensions +img_rows, img_cols = 32, 32 +# The CIFAR10 images are RGB. +img_channels = 3 + +# The data, shuffled and split between train and test sets: +(X_train, y_train), (X_test, y_test) = cifar10.load_data() + +# Convert class vectors to binary class matrices. +Y_train = np_utils.to_categorical(y_train, nb_classes) +Y_test = np_utils.to_categorical(y_test, nb_classes) + +X_train = X_train.astype('float32') +X_test = X_test.astype('float32') + +# subtract mean and normalize +mean_image = np.mean(X_train, axis=0) +X_train -= mean_image +X_test -= mean_image +X_train /= 128. +X_test /= 128.
+ +model = ResNet18((img_rows, img_cols, img_channels), nb_classes) +model.compile(loss='categorical_crossentropy', + optimizer='adam', + metrics=['accuracy']) + +if not data_augmentation: + print('Not using data augmentation.') + model.fit(X_train, Y_train, + batch_size=batch_size, + nb_epoch=nb_epoch, + validation_data=(X_test, Y_test), + shuffle=True, + callbacks=[lr_reducer, early_stopper, csv_logger, model_checkpoint]) +else: + print('Using real-time data augmentation.') + # This will do preprocessing and realtime data augmentation: + datagen = ImageDataGenerator( + featurewise_center=False, # set input mean to 0 over the dataset + samplewise_center=False, # set each sample mean to 0 + featurewise_std_normalization=False, # divide inputs by std of the dataset + samplewise_std_normalization=False, # divide each input by its std + zca_whitening=False, # apply ZCA whitening + rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) + width_shift_range=0.1, # randomly shift images horizontally (fraction of total width) + height_shift_range=0.1, # randomly shift images vertically (fraction of total height) + horizontal_flip=True, # randomly flip images + vertical_flip=False) # randomly flip images + + # Compute quantities required for featurewise normalization + # (std, mean, and principal components if ZCA whitening is applied). + datagen.fit(X_train) + + # Fit the model on the batches generated by datagen.flow(). 
+ model.fit_generator(datagen.flow(X_train, Y_train, batch_size=batch_size), + steps_per_epoch=X_train.shape[0] // batch_size, + validation_data=(X_test, Y_test), + epochs=nb_epoch, verbose=2, + callbacks=[lr_reducer, early_stopper, csv_logger, model_checkpoint]) + +scores = model.evaluate(X_test, Y_test, batch_size=batch_size) +print('Test loss : ', scores[0]) +print('Test accuracy : ', scores[1]) diff --git a/keras_contrib/applications/__init__.py b/keras_contrib/applications/__init__.py index e9d829d..a1592a7 100644 --- a/keras_contrib/applications/__init__.py +++ b/keras_contrib/applications/__init__.py @@ -1,2 +1,5 @@ from .densenet import DenseNet from .ror import ResidualOfResidual +from .resnet import ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 +from .wide_resnet import WideResidualNetwork +from .nasnet import NASNet, NASNetLarge, NASNetMobile diff --git a/keras_contrib/applications/densenet.py b/keras_contrib/applications/densenet.py index ff72ceb..b885703 100644 --- a/keras_contrib/applications/densenet.py +++ b/keras_contrib/applications/densenet.py @@ -1,8 +1,46 @@ # -*- coding: utf-8 -*- -'''DenseNet models for Keras. +'''DenseNet and DenseNet-FCN models for Keras. + +DenseNet is a network architecture where each layer is directly connected +to every other layer in a feed-forward fashion (within each dense block). +For each layer, the feature maps of all preceding layers are treated as +separate inputs whereas its own feature maps are passed on as inputs to +all subsequent layers. This connectivity pattern yields state-of-the-art +accuracies on CIFAR10/100 (with or without data augmentation) and SVHN. +On the large scale ILSVRC 2012 (ImageNet) dataset, DenseNet achieves a +similar accuracy as ResNet, but using less than half the amount of +parameters and roughly half the number of FLOPs. + +DenseNets support any input image size of 32x32 or greater, and are thus +suited for CIFAR-10 or CIFAR-100 datasets. 
There are two types of DenseNets, +one suited for smaller images (DenseNet) and one suited for ImageNet, +called DenseNetImageNet. They are differentiated by the strided convolution +and pooling operations prior to the initial dense block. + +The following table describes the size and error rate of DenseNetImageNet models +on the ImageNet dataset (single crop), for which weights are provided: +------------------------------------------------------------------------------------ + Model type | ImageNet Err (Top 1) | ImageNet Err (Top 5) | Params (M) | +------------------------------------------------------------------------------------ +| DenseNet-121 | 25.02 % | 7.71 % | 8.0 | +| DenseNet-169 | 23.80 % | 6.85 % | 14.3 | +| DenseNet-201 | 22.58 % | 6.34 % | 20.2 | +| DenseNet-161 | 22.20 % | - % | 28.9 | +------------------------------------------------------------------------------------ + +DenseNets can be extended to image segmentation tasks as described in the +paper "The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for +Semantic Segmentation". Here, the dense blocks are arranged and concatenated +with long skip connections for state of the art performance on the CamVid dataset.
+ # Reference - [Densely Connected Convolutional Networks](https://arxiv.org/pdf/1608.06993.pdf) - [The One Hundred Layers Tiramisu: Fully Convolutional DenseNets for Semantic Segmentation](https://arxiv.org/pdf/1611.09326.pdf) + +This implementation is based on the following reference code: + - https://github.com/gpleiss/efficient_densenet_pytorch + - https://github.com/liuzhuang13/DenseNet + ''' from __future__ import print_function from __future__ import absolute_import @@ -11,89 +49,147 @@ from __future__ import division import warnings from keras.models import Model -from keras.layers.core import Dense, Dropout, Activation, Reshape -from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D -from keras.layers.pooling import AveragePooling2D -from keras.layers.pooling import GlobalAveragePooling2D +from keras.layers import Dense +from keras.layers import Dropout +from keras.layers import Activation +from keras.layers import Reshape +from keras.layers import Conv2D +from keras.layers import Conv2DTranspose +from keras.layers import UpSampling2D +from keras.layers import MaxPooling2D +from keras.layers import AveragePooling2D +from keras.layers import GlobalMaxPooling2D +from keras.layers import GlobalAveragePooling2D from keras.layers import Input -from keras.layers.merge import concatenate -from keras.layers.normalization import BatchNormalization +from keras.layers import concatenate +from keras.layers import BatchNormalization from keras.regularizers import l2 from keras.utils.layer_utils import convert_all_kernels_in_model from keras.utils.data_utils import get_file from keras.engine.topology import get_source_inputs from keras.applications.imagenet_utils import _obtain_input_shape +from keras.applications.imagenet_utils import decode_predictions +from keras.applications.imagenet_utils import preprocess_input as _preprocess_input import keras.backend as K from keras_contrib.layers.convolutional import SubPixelUpscaling -TH_WEIGHTS_PATH = 
'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering.h5' -TF_WEIGHTS_PATH = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering.h5' -TH_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Theano-Backend-TH-dim-ordering-no-top.h5' -TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/titu1994/DenseNet/releases/download/v2.0/DenseNet-40-12-Tensorflow-Backend-TF-dim-ordering-no-top.h5' +DENSENET_121_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32.h5' +DENSENET_161_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-161-48.h5' +DENSENET_169_WEIGHTS_PATH = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-169-32.h5' +DENSENET_121_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32-no-top.h5' +DENSENET_161_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-161-48-no-top.h5' +DENSENET_169_WEIGHTS_PATH_NO_TOP = r'https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-169-32-no-top.h5' -def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=16, nb_layers_per_block=-1, - bottleneck=False, reduction=0.0, dropout_rate=0.0, weight_decay=1E-4, - include_top=True, weights='cifar10', input_tensor=None, - classes=10, activation='softmax'): - '''Instantiate the DenseNet architecture, - optionally loading weights pre-trained - on CIFAR-10. Note that when using TensorFlow, - for best performance you should set - `image_data_format='channels_last'` in your Keras config - at ~/.keras/keras.json. - The model and the weights are compatible with both - TensorFlow and Theano. The dimension ordering - convention used by the model is the one - specified in your Keras config file. 
- # Arguments - input_shape: optional shape tuple, only to be specified - if `include_top` is False (otherwise the input shape - has to be `(32, 32, 3)` (with `channels_last` dim ordering) - or `(3, 32, 32)` (with `channels_first` dim ordering). - It should have exactly 3 inputs channels, - and width and height should be no smaller than 8. - E.g. `(200, 200, 3)` would be one valid value. - depth: number or layers in the DenseNet - nb_dense_block: number of dense blocks to add to end (generally = 3) - growth_rate: number of filters to add per dense block - nb_filter: initial number of filters. -1 indicates initial - number of filters is 2 * growth_rate - nb_layers_per_block: number of layers in each dense block. - Can be a -1, positive integer or a list. - If -1, calculates nb_layer_per_block from the network depth. - If positive integer, a set number of layers per dense block. - If list, nb_layer is used as provided. Note that list size must - be (nb_dense_block + 1) - bottleneck: flag to add bottleneck blocks in between dense blocks - reduction: reduction factor of transition blocks. - Note : reduction value is inverted to compute compression. - dropout_rate: dropout rate - weight_decay: weight decay factor - include_top: whether to include the fully-connected - layer at the top of the network. - weights: one of `None` (random initialization) or - 'cifar10' (pre-training on CIFAR-10).. - input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) - to use as image input for the model. - classes: optional number of classes to classify images - into, only to be specified if `include_top` is True, and - if no `weights` argument is specified. - activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'. - Note that if sigmoid is used, classes must be 1. - # Returns - A Keras model instance. - ''' +def preprocess_input(x, data_format=None): + """Preprocesses a tensor encoding a batch of images. 
- if weights not in {'cifar10', None}: + # Arguments + x: input Numpy tensor, 4D. + data_format: data format of the image tensor. + + # Returns + Preprocessed tensor. + """ + x = _preprocess_input(x, data_format=data_format) + x *= 0.017 # scale values + return x + + +def DenseNet(input_shape=None, + depth=40, + nb_dense_block=3, + growth_rate=12, + nb_filter=-1, + nb_layers_per_block=-1, + bottleneck=False, + reduction=0.0, + dropout_rate=0.0, + weight_decay=1e-4, + subsample_initial_block=False, + include_top=True, + weights=None, + input_tensor=None, + pooling=None, + classes=10, + activation='softmax'): + '''Instantiate the DenseNet architecture. + + The model and the weights are compatible with both + TensorFlow and Theano. The dimension ordering + convention used by the model is the one + specified in your Keras config file. + + # Arguments + input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(224, 224, 3)` (with `channels_last` dim ordering) + or `(3, 224, 224)` (with `channels_first` dim ordering). + It should have exactly 3 input channels, + and width and height should be no smaller than 8. + E.g. `(224, 224, 3)` would be one valid value. + depth: number of layers in the DenseNet + nb_dense_block: number of dense blocks to add to end + growth_rate: number of filters to add per dense block + nb_filter: initial number of filters. -1 indicates initial + number of filters will default to 2 * growth_rate + nb_layers_per_block: number of layers in each dense block. + Can be a -1, positive integer or a list. + If -1, calculates nb_layer_per_block from the network depth. + If positive integer, a set number of layers per dense block. + If list, nb_layer is used as provided. Note that list size must + be nb_dense_block + bottleneck: flag to add bottleneck blocks in between dense blocks + reduction: reduction factor of transition blocks.
+ Note : reduction value is inverted to compute compression. + dropout_rate: dropout rate + weight_decay: weight decay rate + subsample_initial_block: Changes model type to suit different datasets. + Should be set to True for ImageNet, and False for CIFAR datasets. + When set to True, the initial convolution will be strided and + a MaxPooling2D layer is added before the initial dense block. + include_top: whether to include the fully-connected + layer at the top of the network. + weights: one of `None` (random initialization) or + 'imagenet' (pre-training on ImageNet). + input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) + to use as image input for the model. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model + will be the 4D tensor output of the + last convolutional layer. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional layer, and thus + the output of the model will be a + 2D tensor. + - `max` means that global max pooling will + be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + activation: Type of activation at the top layer. Can be one of + 'softmax' or 'sigmoid'. Note that if sigmoid is used, + classes must be 1. + + # Returns + A Keras model instance. + + # Raises + ValueError: in case of invalid argument for `weights`, + or invalid input shape.
+ ''' + + if weights not in {'imagenet', None}: raise ValueError('The `weights` argument should be either ' - '`None` (random initialization) or `cifar10` ' - '(pre-training on CIFAR-10).') + '`None` (random initialization) or `imagenet` ' + '(pre-training on ImageNet).') - if weights == 'cifar10' and include_top and classes != 10: - raise ValueError('If using `weights` as CIFAR 10 with `include_top`' - ' as true, `classes` should be 10') + if weights == 'imagenet' and include_top and classes != 1000: + raise ValueError('If using `weights` as ImageNet with `include_top` ' + 'as true, `classes` should be 1000') if activation not in ['softmax', 'sigmoid']: raise ValueError('activation must be one of "softmax" or "sigmoid"') @@ -106,7 +202,7 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi default_size=32, min_size=8, data_format=K.image_data_format(), - include_top=include_top) + require_flatten=include_top) if input_tensor is None: img_input = Input(shape=input_shape) @@ -117,8 +213,9 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi img_input = input_tensor x = __create_dense_net(classes, img_input, include_top, depth, nb_dense_block, - growth_rate, nb_filter, nb_layers_per_block, bottleneck, reduction, - dropout_rate, weight_decay, activation) + growth_rate, nb_filter, nb_layers_per_block, bottleneck, + reduction, dropout_rate, weight_decay, subsample_initial_block, + pooling, activation) # Ensure that the model takes into account # any potential predecessors of `input_tensor`. @@ -130,47 +227,69 @@ def DenseNet(input_shape=None, depth=40, nb_dense_block=3, growth_rate=12, nb_fi model = Model(inputs, x, name='densenet') # load weights - if weights == 'cifar10': - if (depth == 40) and (nb_dense_block == 3) and (growth_rate == 12) and (nb_filter == 16) and \ - (bottleneck is False) and (reduction == 0.0) and (dropout_rate == 0.0) and (weight_decay == 1E-4): - # Default parameters match. 
Weights for this model exist: + if weights == 'imagenet': + weights_loaded = False - if K.image_data_format() == 'channels_first': - if include_top: - weights_path = get_file('densenet_40_12_th_dim_ordering_th_kernels.h5', - TH_WEIGHTS_PATH, - cache_subdir='models') - else: - weights_path = get_file('densenet_40_12_th_dim_ordering_th_kernels_no_top.h5', - TH_WEIGHTS_PATH_NO_TOP, - cache_subdir='models') - - model.load_weights(weights_path) - - if K.backend() == 'tensorflow': - warnings.warn('You are using the TensorFlow backend, yet you ' - 'are using the Theano ' - 'image dimension ordering convention ' - '(`image_data_format="channels_first"`). ' - 'For best performance, set ' - '`image_data_format="channels_last"` in ' - 'your Keras config ' - 'at ~/.keras/keras.json.') - convert_all_kernels_in_model(model) + if (depth == 121) and (nb_dense_block == 4) and (growth_rate == 32) and (nb_filter == 64) and \ + (bottleneck is True) and (reduction == 0.5) and (subsample_initial_block): + if include_top: + weights_path = get_file('DenseNet-BC-121-32.h5', + DENSENET_121_WEIGHTS_PATH, + cache_subdir='models', + md5_hash='a439dd41aa672aef6daba4ee1fd54abd') else: - if include_top: - weights_path = get_file('densenet_40_12_tf_dim_ordering_tf_kernels.h5', - TF_WEIGHTS_PATH, - cache_subdir='models') - else: - weights_path = get_file('densenet_40_12_tf_dim_ordering_tf_kernels_no_top.h5', - TF_WEIGHTS_PATH_NO_TOP, - cache_subdir='models') + weights_path = get_file('DenseNet-BC-121-32-no-top.h5', + DENSENET_121_WEIGHTS_PATH_NO_TOP, + cache_subdir='models', + md5_hash='55e62a6358af8a0af0eedf399b5aea99') + model.load_weights(weights_path, by_name=True) + weights_loaded = True - model.load_weights(weights_path) + if (depth == 161) and (nb_dense_block == 4) and (growth_rate == 48) and (nb_filter == 96) and \ + (bottleneck is True) and (reduction == 0.5) and (subsample_initial_block): + if include_top: + weights_path = get_file('DenseNet-BC-161-48.h5', + DENSENET_161_WEIGHTS_PATH, + 
cache_subdir='models', + md5_hash='6c326cf4fbdb57d31eff04333a23fcca') + else: + weights_path = get_file('DenseNet-BC-161-48-no-top.h5', + DENSENET_161_WEIGHTS_PATH_NO_TOP, + cache_subdir='models', + md5_hash='1a9476b79f6b7673acaa2769e6427b92') + model.load_weights(weights_path, by_name=True) + weights_loaded = True - if K.backend() == 'theano': - convert_all_kernels_in_model(model) + if (depth == 169) and (nb_dense_block == 4) and (growth_rate == 32) and (nb_filter == 64) and \ + (bottleneck is True) and (reduction == 0.5) and (subsample_initial_block): + if include_top: + weights_path = get_file('DenseNet-BC-169-32.h5', + DENSENET_169_WEIGHTS_PATH, + cache_subdir='models', + md5_hash='914869c361303d2e39dec640b4e606a6') + else: + weights_path = get_file('DenseNet-BC-169-32-no-top.h5', + DENSENET_169_WEIGHTS_PATH_NO_TOP, + cache_subdir='models', + md5_hash='89c19e8276cfd10585d5fadc1df6859e') + model.load_weights(weights_path, by_name=True) + weights_loaded = True + + if weights_loaded: + if K.backend() == 'theano': + convert_all_kernels_in_model(model) + + if K.image_data_format() == 'channels_first' and K.backend() == 'tensorflow': + warnings.warn('You are using the TensorFlow backend, yet you ' + 'are using the Theano ' + 'image data format convention ' + '(`image_data_format="channels_first"`). 
' + 'For best performance, set ' + '`image_data_format="channels_last"` in ' + 'your Keras config ' + 'at ~/.keras/keras.json.') + + print("Weights for the model were loaded successfully") return model @@ -297,135 +416,297 @@ def DenseNetFCN(input_shape, nb_dense_block=5, growth_rate=16, nb_layers_per_blo return model -def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1E-4): - ''' Apply BatchNorm, Relu, 3x3 Conv2D, optional bottleneck block and dropout - Args: - ip: Input keras tensor - nb_filter: number of filters - bottleneck: add bottleneck block +def DenseNetImageNet121(input_shape=None, + bottleneck=True, + reduction=0.5, + dropout_rate=0.0, + weight_decay=1e-4, + include_top=True, + weights='imagenet', + input_tensor=None, + pooling=None, + classes=1000, + activation='softmax'): + return DenseNet(input_shape, depth=121, nb_dense_block=4, growth_rate=32, nb_filter=64, + nb_layers_per_block=[6, 12, 24, 16], bottleneck=bottleneck, reduction=reduction, + dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True, + include_top=include_top, weights=weights, input_tensor=input_tensor, + pooling=pooling, classes=classes, activation=activation) + + +def DenseNetImageNet169(input_shape=None, + bottleneck=True, + reduction=0.5, + dropout_rate=0.0, + weight_decay=1e-4, + include_top=True, + weights='imagenet', + input_tensor=None, + pooling=None, + classes=1000, + activation='softmax'): + return DenseNet(input_shape, depth=169, nb_dense_block=4, growth_rate=32, nb_filter=64, + nb_layers_per_block=[6, 12, 32, 32], bottleneck=bottleneck, reduction=reduction, + dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True, + include_top=include_top, weights=weights, input_tensor=input_tensor, + pooling=pooling, classes=classes, activation=activation) + + +def DenseNetImageNet201(input_shape=None, + bottleneck=True, + reduction=0.5, + dropout_rate=0.0, + weight_decay=1e-4, + include_top=True, + 
weights=None, + input_tensor=None, + pooling=None, + classes=1000, + activation='softmax'): + return DenseNet(input_shape, depth=201, nb_dense_block=4, growth_rate=32, nb_filter=64, + nb_layers_per_block=[6, 12, 48, 32], bottleneck=bottleneck, reduction=reduction, + dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True, + include_top=include_top, weights=weights, input_tensor=input_tensor, + pooling=pooling, classes=classes, activation=activation) + + +def DenseNetImageNet264(input_shape=None, + bottleneck=True, + reduction=0.5, + dropout_rate=0.0, + weight_decay=1e-4, + include_top=True, + weights=None, + input_tensor=None, + pooling=None, + classes=1000, + activation='softmax'): + return DenseNet(input_shape, depth=201, nb_dense_block=4, growth_rate=32, nb_filter=64, + nb_layers_per_block=[6, 12, 64, 48], bottleneck=bottleneck, reduction=reduction, + dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True, + include_top=include_top, weights=weights, input_tensor=input_tensor, + pooling=pooling, classes=classes, activation=activation) + + +def DenseNetImageNet161(input_shape=None, + bottleneck=True, + reduction=0.5, + dropout_rate=0.0, + weight_decay=1e-4, + include_top=True, + weights='imagenet', + input_tensor=None, + pooling=None, + classes=1000, + activation='softmax'): + return DenseNet(input_shape, depth=161, nb_dense_block=4, growth_rate=48, nb_filter=96, + nb_layers_per_block=[6, 12, 36, 24], bottleneck=bottleneck, reduction=reduction, + dropout_rate=dropout_rate, weight_decay=weight_decay, subsample_initial_block=True, + include_top=include_top, weights=weights, input_tensor=input_tensor, + pooling=pooling, classes=classes, activation=activation) + + +def name_or_none(prefix, name): + return prefix + name if (prefix is not None and name is not None) else None + + +def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1e-4, block_prefix=None): + ''' + Adds a convolution 
layer (with batch normalization and relu), + and optionally a bottleneck layer. + + # Arguments + ip: Input tensor + nb_filter: integer, the dimensionality of the output space + (i.e. the number output of filters in the convolution) + bottleneck: if True, adds a bottleneck convolution block dropout_rate: dropout rate weight_decay: weight decay factor - Returns: keras tensor with batch_norm, relu and convolution2d added (optional bottleneck) + block_prefix: str, for unique layer naming + + # Input shape + 4D tensor with shape: + `(samples, channels, rows, cols)` if data_format='channels_first' + or 4D tensor with shape: + `(samples, rows, cols, channels)` if data_format='channels_last'. + + # Output shape + 4D tensor with shape: + `(samples, filters, new_rows, new_cols)` if data_format='channels_first' + or 4D tensor with shape: + `(samples, new_rows, new_cols, filters)` if data_format='channels_last'. + `rows` and `cols` values might have changed due to stride. + + # Returns + output tensor of block ''' + with K.name_scope('ConvBlock'): + concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name=name_or_none(block_prefix, '_bn'))(ip) + x = Activation('relu')(x) - x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay), - beta_regularizer=l2(weight_decay))(ip) - x = Activation('relu')(x) + if bottleneck: + inter_channel = nb_filter * 4 - if bottleneck: - inter_channel = nb_filter * 4 # Obtained from https://github.com/liuzhuang13/DenseNet/blob/master/densenet.lua - - x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False, - kernel_regularizer=l2(weight_decay))(x) + x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_normal', padding='same', use_bias=False, + kernel_regularizer=l2(weight_decay), name=name_or_none(block_prefix, '_bottleneck_conv2D'))(x) + x 
= BatchNormalization(axis=concat_axis, epsilon=1.1e-5, + name=name_or_none(block_prefix, '_bottleneck_bn'))(x) + x = Activation('relu')(x) + x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_normal', padding='same', use_bias=False, + name=name_or_none(block_prefix, '_conv2D'))(x) if dropout_rate: x = Dropout(dropout_rate)(x) - x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay), - beta_regularizer=l2(weight_decay))(x) + return x + + +def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropout_rate=None, + weight_decay=1e-4, grow_nb_filters=True, return_concat_list=False, block_prefix=None): + ''' + Build a dense_block where the output of each conv_block is fed + to subsequent ones + + # Arguments + x: input keras tensor + nb_layers: the number of conv_blocks to append to the model + nb_filter: integer, the dimensionality of the output space + (i.e. the number output of filters in the convolution) + growth_rate: growth rate of the dense block + bottleneck: if True, adds a bottleneck convolution block to + each conv_block + dropout_rate: dropout rate + weight_decay: weight decay factor + grow_nb_filters: if True, allows number of filters to grow + return_concat_list: set to True to return the list of + feature maps along with the actual output + block_prefix: str, for block unique naming + + # Return + If return_concat_list is True, returns a list of the output + keras tensor, the number of filters and a list of all the + dense blocks added to the keras tensor + + If return_concat_list is False, returns a list of the output + keras tensor and the number of filters + ''' + with K.name_scope('DenseBlock'): + concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 + + x_list = [x] + + for i in range(nb_layers): + cb = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay, + block_prefix=name_or_none(block_prefix, '_%i' % i)) + x_list.append(cb) + + x = concatenate([x, cb], axis=concat_axis) + + 
if grow_nb_filters: + nb_filter += growth_rate + + if return_concat_list: + return x, nb_filter, x_list + else: + return x, nb_filter + + +def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4, block_prefix=None): + ''' + Adds a pointwise convolution layer (with batch normalization and relu), + and an average pooling layer. The number of output convolution filters + can be reduced by appropriately reducing the compression parameter. + + # Arguments + ip: input keras tensor + nb_filter: integer, the dimensionality of the output space + (i.e. the number output of filters in the convolution) + compression: calculated as 1 - reduction. Reduces the number + of feature maps in the transition block. + weight_decay: weight decay factor + block_prefix: str, for block unique naming + + # Input shape + 4D tensor with shape: + `(samples, channels, rows, cols)` if data_format='channels_first' + or 4D tensor with shape: + `(samples, rows, cols, channels)` if data_format='channels_last'. + + # Output shape + 4D tensor with shape: + `(samples, nb_filter * compression, rows / 2, cols / 2)` + if data_format='channels_first' + or 4D tensor with shape: + `(samples, rows / 2, cols / 2, nb_filter * compression)` + if data_format='channels_last'. 
+ + # Returns + a keras tensor + ''' + with K.name_scope('Transition'): + concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 + + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name=name_or_none(block_prefix, '_bn'))(ip) x = Activation('relu')(x) + x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_normal', padding='same', + use_bias=False, kernel_regularizer=l2(weight_decay), name=name_or_none(block_prefix, '_conv2D'))(x) + x = AveragePooling2D((2, 2), strides=(2, 2))(x) - x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', use_bias=False, - kernel_regularizer=l2(weight_decay))(x) - if dropout_rate: - x = Dropout(dropout_rate)(x) - - return x + return x -def __transition_block(ip, nb_filter, compression=1.0, dropout_rate=None, weight_decay=1E-4): - ''' Apply BatchNorm, Relu 1x1, Conv2D, optional compression, dropout and Maxpooling2D - Args: - ip: keras tensor - nb_filter: number of filters - compression: calculated as 1 - reduction. Reduces the number of feature maps - in the transition block. - dropout_rate: dropout rate +def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4, block_prefix=None): + '''Adds an upsampling block. Upsampling operation relies on the the type parameter. + + # Arguments + ip: input keras tensor + nb_filters: integer, the dimensionality of the output space + (i.e. the number output of filters in the convolution) + type: can be 'upsampling', 'subpixel', 'deconv'. Determines + type of upsampling performed weight_decay: weight decay factor - Returns: keras tensor, after applying batch_norm, relu-conv, dropout, maxpool + block_prefix: str, for block unique naming + + # Input shape + 4D tensor with shape: + `(samples, channels, rows, cols)` if data_format='channels_first' + or 4D tensor with shape: + `(samples, rows, cols, channels)` if data_format='channels_last'. 
+ + # Output shape + 4D tensor with shape: + `(samples, nb_filter, rows * 2, cols * 2)` if data_format='channels_first' + or 4D tensor with shape: + `(samples, rows * 2, cols * 2, nb_filter)` if data_format='channels_last'. + + # Returns + a keras tensor ''' + with K.name_scope('TransitionUp'): - concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - - x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay), - beta_regularizer=l2(weight_decay))(ip) - x = Activation('relu')(x) - x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_uniform', padding='same', use_bias=False, - kernel_regularizer=l2(weight_decay))(x) - if dropout_rate: - x = Dropout(dropout_rate)(x) - x = AveragePooling2D((2, 2), strides=(2, 2))(x) - - return x - - -def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropout_rate=None, weight_decay=1E-4, - grow_nb_filters=True, return_concat_list=False): - ''' Build a dense_block where the output of each conv_block is fed to subsequent ones - Args: - x: keras tensor - nb_layers: the number of layers of conv_block to append to the model. 
- nb_filter: number of filters - growth_rate: growth rate - bottleneck: bottleneck block - dropout_rate: dropout rate - weight_decay: weight decay factor - grow_nb_filters: flag to decide to allow number of filters to grow - return_concat_list: return the list of feature maps along with the actual output - Returns: keras tensor with nb_layers of conv_block appended - ''' - - concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - - x_list = [x] - - for i in range(nb_layers): - conv_block = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay) - x_list.append(conv_block) - - x = concatenate([x, conv_block], axis=concat_axis) - - if grow_nb_filters: - nb_filter += growth_rate - - if return_concat_list: - return x, nb_filter, x_list - else: - return x, nb_filter - - -def __transition_up_block(ip, nb_filters, type='upsampling', weight_decay=1E-4): - ''' SubpixelConvolutional Upscaling (factor = 2) - Args: - ip: keras tensor - nb_filters: number of layers - type: can be 'upsampling', 'subpixel', 'deconv'. Determines type of upsampling performed - weight_decay: weight decay factor - Returns: keras tensor, after applying upsampling operation. 
- ''' - - if type == 'upsampling': - x = UpSampling2D()(ip) - elif type == 'subpixel': - x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay), - use_bias=False, kernel_initializer='he_uniform')(ip) - x = SubPixelUpscaling(scale_factor=2)(x) - x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', W_regularizer=l2(weight_decay), - use_bias=False, kernel_initializer='he_uniform')(x) - else: - x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same', strides=(2, 2), - kernel_initializer='he_uniform')(ip) - - return x + if type == 'upsampling': + x = UpSampling2D(name=name_or_none(block_prefix, '_upsampling'))(ip) + elif type == 'subpixel': + x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay), + use_bias=False, kernel_initializer='he_normal', name=name_or_none(block_prefix, '_conv2D'))(ip) + x = SubPixelUpscaling(scale_factor=2, name=name_or_none(block_prefix, '_subpixel'))(x) + x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay), + use_bias=False, kernel_initializer='he_normal', name=name_or_none(block_prefix, '_conv2D'))(x) + else: + x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same', strides=(2, 2), + kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay), + name=name_or_none(block_prefix, '_conv2DT'))(ip) + return x def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_block=3, growth_rate=12, nb_filter=-1, - nb_layers_per_block=-1, bottleneck=False, reduction=0.0, dropout_rate=None, weight_decay=1E-4, - activation='softmax'): + nb_layers_per_block=-1, bottleneck=False, reduction=0.0, dropout_rate=None, weight_decay=1e-4, + subsample_initial_block=False, pooling=None, activation='softmax'): ''' Build the DenseNet model - Args: + + # Arguments nb_classes: number of classes img_input: tuple of shape (channels, rows, columns) or (rows, 
columns, channels) include_top: flag to include the final Dense layer @@ -442,79 +723,120 @@ def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_bl bottleneck: add bottleneck blocks reduction: reduction factor of transition blocks. Note : reduction value is inverted to compute compression dropout_rate: dropout rate - weight_decay: weight decay + weight_decay: weight decay rate + subsample_initial_block: Changes model type to suit different datasets. + Should be set to True for ImageNet, and False for CIFAR datasets. + When set to True, the initial convolution will be strided and + adds a MaxPooling2D before the initial dense block. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model + will be the 4D tensor output of the + last convolutional layer. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional layer, and thus + the output of the model will be a + 2D tensor. + - `max` means that global max pooling will + be applied. activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'. Note that if sigmoid is used, classes must be 1. 
- Returns: keras tensor with nb_layers of conv_block appended + + # Returns + a keras tensor + + # Raises + ValueError: in case of invalid argument for `reduction` + or `nb_dense_block` ''' + with K.name_scope('DenseNet'): + concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 + if reduction != 0.0: + if not (reduction <= 1.0 and reduction > 0.0): + raise ValueError('`reduction` value must lie between 0.0 and 1.0') - assert (depth - 4) % 3 == 0, 'Depth must be 3 N + 4' - if reduction != 0.0: - assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0' + # layers in each dense block + if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple: + nb_layers = list(nb_layers_per_block) # Convert tuple to list - # layers in each dense block - if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple: - nb_layers = list(nb_layers_per_block) # Convert tuple to list + if len(nb_layers) != (nb_dense_block): + raise ValueError('If `nb_dense_block` is a list, its length must match ' + 'the number of layers provided by `nb_layers`.') - assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. 
' \ - 'Note that list size must be (nb_dense_block + 1)' - final_nb_layer = nb_layers[-1] - nb_layers = nb_layers[:-1] - else: - if nb_layers_per_block == -1: - count = int((depth - 4) / 3) - nb_layers = [count for _ in range(nb_dense_block)] - final_nb_layer = count + final_nb_layer = nb_layers[-1] + nb_layers = nb_layers[:-1] else: - final_nb_layer = nb_layers_per_block - nb_layers = [nb_layers_per_block] * nb_dense_block + if nb_layers_per_block == -1: + assert (depth - 4) % 3 == 0, 'Depth must be 3 N + 4 if nb_layers_per_block == -1' + count = int((depth - 4) / 3) + nb_layers = [count for _ in range(nb_dense_block)] + final_nb_layer = count + else: + final_nb_layer = nb_layers_per_block + nb_layers = [nb_layers_per_block] * nb_dense_block - if bottleneck: - nb_layers = [int(layer // 2) for layer in nb_layers] + # compute initial nb_filter if -1, else accept users initial nb_filter + if nb_filter <= 0: + nb_filter = 2 * growth_rate - # compute initial nb_filter if -1, else accept users initial nb_filter - if nb_filter <= 0: - nb_filter = 2 * growth_rate + # compute compression factor + compression = 1.0 - reduction - # compute compression factor - compression = 1.0 - reduction + # Initial convolution + if subsample_initial_block: + initial_kernel = (7, 7) + initial_strides = (2, 2) + else: + initial_kernel = (3, 3) + initial_strides = (1, 1) - # Initial convolution - x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D', - use_bias=False, kernel_regularizer=l2(weight_decay))(img_input) + x = Conv2D(nb_filter, initial_kernel, kernel_initializer='he_normal', padding='same', name='initial_conv2D', + strides=initial_strides, use_bias=False, kernel_regularizer=l2(weight_decay))(img_input) - # Add dense blocks - for block_idx in range(nb_dense_block - 1): - x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, bottleneck=bottleneck, - dropout_rate=dropout_rate, weight_decay=weight_decay) - # add 
transition_block - x = __transition_block(x, nb_filter, compression=compression, dropout_rate=dropout_rate, - weight_decay=weight_decay) - nb_filter = int(nb_filter * compression) + if subsample_initial_block: + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='initial_bn')(x) + x = Activation('relu')(x) + x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) - # The last dense_block does not have a transition_block - x, nb_filter = __dense_block(x, final_nb_layer, nb_filter, growth_rate, bottleneck=bottleneck, - dropout_rate=dropout_rate, weight_decay=weight_decay) + # Add dense blocks + for block_idx in range(nb_dense_block - 1): + x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, bottleneck=bottleneck, + dropout_rate=dropout_rate, weight_decay=weight_decay, + block_prefix='dense_%i' % block_idx) + # add transition_block + x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay, + block_prefix='tr_%i' % block_idx) + nb_filter = int(nb_filter * compression) - x = BatchNormalization(axis=concat_axis, gamma_regularizer=l2(weight_decay), - beta_regularizer=l2(weight_decay))(x) - x = Activation('relu')(x) - x = GlobalAveragePooling2D()(x) + # The last dense_block does not have a transition_block + x, nb_filter = __dense_block(x, final_nb_layer, nb_filter, growth_rate, bottleneck=bottleneck, + dropout_rate=dropout_rate, weight_decay=weight_decay, + block_prefix='dense_%i' % (nb_dense_block - 1)) - if include_top: - x = Dense(nb_classes, activation=activation, W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay))(x) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='final_bn')(x) + x = Activation('relu')(x) - return x + if include_top: + x = GlobalAveragePooling2D()(x) + x = Dense(nb_classes, activation=activation)(x) + else: + if pooling == 'avg': + x = GlobalAveragePooling2D()(x) + if pooling == 'max': + x = GlobalMaxPooling2D()(x) + + return x def 
__create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, growth_rate=12, - reduction=0.0, dropout_rate=None, weight_decay=1E-4, - nb_layers_per_block=4, nb_upsampling_conv=128, upsampling_type='deconv', - init_conv_filters=48, input_shape=None, activation='softmax'): - ''' Build the DenseNet model - Args: + reduction=0.0, dropout_rate=None, weight_decay=1e-4, + nb_layers_per_block=4, nb_upsampling_conv=128, upsampling_type='upsampling', + init_conv_filters=48, input_shape=None, activation='deconv'): + ''' Build the DenseNet-FCN model + + # Arguments nb_classes: number of classes img_input: tuple of shape (channels, rows, columns) or (rows, columns, channels) include_top: flag to include the final Dense layer @@ -534,104 +856,116 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, input_shape: Only used for shape inference in fully convolutional networks. activation: Type of activation at the top layer. Can be one of 'softmax' or 'sigmoid'. Note that if sigmoid is used, classes must be 1. - Returns: keras tensor with nb_layers of conv_block appended + + # Returns + a keras tensor + + # Raises + ValueError: in case of invalid argument for `reduction`, + `nb_dense_block` or `nb_upsampling_conv`. 
''' + with K.name_scope('DenseNetFCN'): + concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - - if concat_axis == 1: # channels_first dim ordering - _, rows, cols = input_shape - else: - rows, cols, _ = input_shape - - if reduction != 0.0: - assert reduction <= 1.0 and reduction > 0.0, 'reduction value must lie between 0.0 and 1.0' - - # check if upsampling_conv has minimum number of filters - # minimum is set to 12, as at least 3 color channels are needed for correct upsampling - assert nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0, 'Parameter `upsampling_conv` number of channels must ' \ - 'be a positive number divisible by 4 and greater ' \ - 'than 12' - - # layers in each dense block - if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple: - nb_layers = list(nb_layers_per_block) # Convert tuple to list - - assert len(nb_layers) == (nb_dense_block + 1), 'If list, nb_layer is used as provided. 
' \ - 'Note that list size must be (nb_dense_block + 1)' - - bottleneck_nb_layers = nb_layers[-1] - rev_layers = nb_layers[::-1] - nb_layers.extend(rev_layers[1:]) - else: - bottleneck_nb_layers = nb_layers_per_block - nb_layers = [nb_layers_per_block] * (2 * nb_dense_block + 1) - - # compute compression factor - compression = 1.0 - reduction - - # Initial convolution - x = Conv2D(init_conv_filters, (3, 3), kernel_initializer='he_uniform', padding='same', name='initial_conv2D', - use_bias=False, kernel_regularizer=l2(weight_decay))(img_input) - - nb_filter = init_conv_filters - - skip_list = [] - - # Add dense blocks and transition down block - for block_idx in range(nb_dense_block): - x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, - dropout_rate=dropout_rate, weight_decay=weight_decay) - - # Skip connection - skip_list.append(x) - - # add transition_block - x = __transition_block(x, nb_filter, compression=compression, dropout_rate=dropout_rate, - weight_decay=weight_decay) - - nb_filter = int(nb_filter * compression) # this is calculated inside transition_down_block - - # The last dense_block does not have a transition_down_block - # return the concatenated feature maps without the concatenation of the input - _, nb_filter, concat_list = __dense_block(x, bottleneck_nb_layers, nb_filter, growth_rate, - dropout_rate=dropout_rate, weight_decay=weight_decay, - return_concat_list=True) - - skip_list = skip_list[::-1] # reverse the skip list - - # Add dense blocks and transition up block - for block_idx in range(nb_dense_block): - n_filters_keep = growth_rate * nb_layers[nb_dense_block + block_idx] - - # upsampling block must upsample only the feature maps (concat_list[1:]), - # not the concatenation of the input with the feature maps (concat_list[0]. 
- l = concatenate(concat_list[1:], axis=concat_axis) - - t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type) - - # concatenate the skip connection with the transition block - x = concatenate([t, skip_list[block_idx]], axis=concat_axis) - - # Dont allow the feature map size to grow in upsampling dense blocks - x_up, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1], nb_filter=growth_rate, - growth_rate=growth_rate, dropout_rate=dropout_rate, - weight_decay=weight_decay, - return_concat_list=True, grow_nb_filters=False) - - if include_top: - x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', kernel_regularizer=l2(weight_decay), - use_bias=False)(x_up) - - if K.image_data_format() == 'channels_first': - channel, row, col = input_shape + if concat_axis == 1: # channels_first dim ordering + _, rows, cols = input_shape else: - row, col, channel = input_shape + rows, cols, _ = input_shape - x = Reshape((row * col, nb_classes))(x) - x = Activation(activation)(x) - x = Reshape((row, col, nb_classes))(x) - else: - x = x_up + if reduction != 0.0: + if not (reduction <= 1.0 and reduction > 0.0): + raise ValueError('`reduction` value must lie between 0.0 and 1.0') - return x + # check if upsampling_conv has minimum number of filters + # minimum is set to 12, as at least 3 color channels are needed for correct upsampling + if not (nb_upsampling_conv > 12 and nb_upsampling_conv % 4 == 0): + raise ValueError('Parameter `nb_upsampling_conv` number of channels must ' + 'be a positive number divisible by 4 and greater than 12') + + # layers in each dense block + if type(nb_layers_per_block) is list or type(nb_layers_per_block) is tuple: + nb_layers = list(nb_layers_per_block) # Convert tuple to list + + if len(nb_layers) != (nb_dense_block + 1): + raise ValueError('If `nb_dense_block` is a list, its length must be ' + '(`nb_dense_block` + 1)') + + bottleneck_nb_layers = nb_layers[-1] + rev_layers = 
nb_layers[::-1] + nb_layers.extend(rev_layers[1:]) + else: + bottleneck_nb_layers = nb_layers_per_block + nb_layers = [nb_layers_per_block] * (2 * nb_dense_block + 1) + + # compute compression factor + compression = 1.0 - reduction + + # Initial convolution + x = Conv2D(init_conv_filters, (7, 7), kernel_initializer='he_normal', padding='same', name='initial_conv2D', + use_bias=False, kernel_regularizer=l2(weight_decay))(img_input) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='initial_bn')(x) + x = Activation('relu')(x) + + nb_filter = init_conv_filters + + skip_list = [] + + # Add dense blocks and transition down block + for block_idx in range(nb_dense_block): + x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, dropout_rate=dropout_rate, + weight_decay=weight_decay, block_prefix='dense_%i' % block_idx) + + # Skip connection + skip_list.append(x) + + # add transition_block + x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay, + block_prefix='tr_%i' % block_idx) + + nb_filter = int(nb_filter * compression) # this is calculated inside transition_down_block + + # The last dense_block does not have a transition_down_block + # return the concatenated feature maps without the concatenation of the input + _, nb_filter, concat_list = __dense_block(x, bottleneck_nb_layers, nb_filter, growth_rate, + dropout_rate=dropout_rate, weight_decay=weight_decay, + return_concat_list=True, + block_prefix='dense_%i' % nb_dense_block) + + skip_list = skip_list[::-1] # reverse the skip list + + # Add dense blocks and transition up block + for block_idx in range(nb_dense_block): + n_filters_keep = growth_rate * nb_layers[nb_dense_block + block_idx] + + # upsampling block must upsample only the feature maps (concat_list[1:]), + # not the concatenation of the input with the feature maps (concat_list[0]. 
+ l = concatenate(concat_list[1:], axis=concat_axis) + + t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type, weight_decay=weight_decay, + block_prefix='tr_up_%i' % block_idx) + + # concatenate the skip connection with the transition block + x = concatenate([t, skip_list[block_idx]], axis=concat_axis) + + # Dont allow the feature map size to grow in upsampling dense blocks + x_up, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1], + nb_filter=growth_rate, growth_rate=growth_rate, + dropout_rate=dropout_rate, weight_decay=weight_decay, + return_concat_list=True, grow_nb_filters=False, + block_prefix='dense_%i' % (nb_dense_block + 1 + block_idx)) + + if include_top: + x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', use_bias=False)(x_up) + + if K.image_data_format() == 'channels_first': + channel, row, col = input_shape + else: + row, col, channel = input_shape + + x = Reshape((row * col, nb_classes))(x) + x = Activation(activation)(x) + x = Reshape((row, col, nb_classes))(x) + else: + x = x_up + + return x diff --git a/keras_contrib/applications/nasnet.py b/keras_contrib/applications/nasnet.py new file mode 100644 index 0000000..89ebb98 --- /dev/null +++ b/keras_contrib/applications/nasnet.py @@ -0,0 +1,773 @@ +"""Collection of NASNet models + +The reference paper: + - [Learning Transferable Architectures for Scalable Image Recognition] + (https://arxiv.org/abs/1707.07012) + +The reference implementation: +1. TF Slim + - https://github.com/tensorflow/models/blob/master/research/slim/nets/ + nasnet/nasnet.py +2. TensorNets + - https://github.com/taehoonlee/tensornets/blob/master/tensornets/nasnets.py +3. 
Weights + - https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet +""" +from __future__ import print_function +from __future__ import absolute_import +from __future__ import division + +import warnings + +from keras.models import Model +from keras.layers import Input +from keras.layers import Activation +from keras.layers import Dense +from keras.layers import Dropout +from keras.layers import BatchNormalization +from keras.layers import MaxPooling2D +from keras.layers import AveragePooling2D +from keras.layers import GlobalAveragePooling2D +from keras.layers import GlobalMaxPooling2D +from keras.layers import Conv2D +from keras.layers import SeparableConv2D +from keras.layers import ZeroPadding2D +from keras.layers import Cropping2D +from keras.layers import concatenate +from keras.layers import add +from keras.regularizers import l2 +from keras.utils.data_utils import get_file +from keras.engine.topology import get_source_inputs +from keras.applications.imagenet_utils import _obtain_input_shape +from keras.applications.inception_v3 import preprocess_input +from keras.applications.imagenet_utils import decode_predictions +from keras import backend as K + +_BN_DECAY = 0.9997 +_BN_EPSILON = 1e-3 + +NASNET_MOBILE_WEIGHT_PATH = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-mobile.h5" +NASNET_MOBILE_WEIGHT_PATH_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-mobile-no-top.h5" +NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-auxiliary-mobile.h5" +NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-auxiliary-mobile-no-top.h5" +NASNET_LARGE_WEIGHT_PATH = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-large.h5" +NASNET_LARGE_WEIGHT_PATH_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-large-no-top.h5" 
NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-auxiliary-large.h5"
NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-auxiliary-large-no-top.h5"


def NASNet(input_shape=None,
           penultimate_filters=4032,
           nb_blocks=6,
           stem_filters=96,
           skip_reduction=True,
           use_auxiliary_branch=False,
           filters_multiplier=2,
           dropout=0.5,
           weight_decay=5e-5,
           include_top=True,
           weights=None,
           input_tensor=None,
           pooling=None,
           classes=1000,
           default_size=None):
    """Instantiates a NASNet architecture.

    Only the TensorFlow backend is supported. If the configured
    `image_data_format` is not `channels_last`, it is switched to
    `channels_last` while the model is built and restored before
    returning (also when building fails).

    # Arguments
        input_shape: optional shape tuple, validated through
            `_obtain_input_shape` with a minimum side of 32.
            E.g. `(224, 224, 3)` would be one valid value.
        penultimate_filters: number of filters in the penultimate layer.
            Must be divisible by 24.
            NASNet models use the notation `NASNet (N @ P)`, where:
                -   N is the number of blocks
                -   P is the number of penultimate filters
        nb_blocks: number of Normal cells repeated in each of the
            three stages (the `N` above).
        stem_filters: number of filters in the initial stem convolution.
        skip_reduction: if `False` (ImageNet / mobile mode) the stem
            convolution uses stride 2 with `valid` padding and is
            followed by two stem Reduction cells. If `True` (CIFAR mode)
            the stem convolution uses stride 1 with `same` padding and
            no stem Reduction cells are added.
        use_auxiliary_branch: whether to add the auxiliary classifier.
            When enabled the model has two outputs.
        filters_multiplier: factor by which the number of filters is
            multiplied after each Reduction cell.
        dropout: dropout rate applied before the final classifier.
        weight_decay: l2 regularization weight.
        include_top: whether to include the fully-connected
            layer at the top of the network.
        weights: `None` (random initialization) or
            `imagenet` (ImageNet weights).
        input_tensor: optional Keras tensor (i.e. output of
            `layers.Input()`) to use as image input for the model.
        pooling: optional pooling mode for feature extraction
            when `include_top` is `False`.
            - `None` means that the output of the model
                will be the 4D tensor output of the
                last convolutional layer.
            - `avg` means that global average pooling
                will be applied to the output of the
                last convolutional layer, and thus
                the output of the model will be a
                2D tensor.
            - `max` means that global max pooling will
                be applied.
        classes: number of classes of the classifier(s). Must be 1000
            when `weights='imagenet'` and `include_top` is true.
        default_size: default image size of the model. Defaults to 331.
            With `weights='imagenet'` it selects the weight files:
            224 for NASNetMobile, 331 for NASNetLarge.

    # Returns
        A Keras model instance.

    # Raises
        ValueError: in case of invalid argument for `weights`,
            `classes`, `penultimate_filters` or `default_size`.
        RuntimeError: if the backend is not TensorFlow.
    """
    if K.backend() != 'tensorflow':
        raise RuntimeError('Only Tensorflow backend is currently supported, '
                           'as other backends do not support '
                           'separable convolution.')

    if weights not in {'imagenet', None}:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization) or `imagenet` '
                         '(pre-training on ImageNet).')

    if weights == 'imagenet' and include_top and classes != 1000:
        raise ValueError('If using `weights` as ImageNet with `include_top` '
                         'as true, `classes` should be 1000')

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if penultimate_filters % 24 != 0:
        raise ValueError("`penultimate_filters` needs to be divisible by 24.")

    if default_size is None:
        default_size = 331

    # Reject unsupported sizes up front rather than after building the graph.
    if weights == 'imagenet' and default_size not in (224, 331):
        raise ValueError('ImageNet weights can only be loaded on NASNetLarge or NASNetMobile')

    # Determine proper input shape and default size.
    # NOTE(review): the shape is validated against the data format that is
    # configured *before* the switch to `channels_last` below - confirm
    # that this is intended.
    input_shape = _obtain_input_shape(input_shape,
                                      default_size=default_size,
                                      min_size=32,
                                      data_format=K.image_data_format(),
                                      require_flatten=include_top or weights)

    previous_data_format = None
    if K.image_data_format() != 'channels_last':
        warnings.warn('The NASNet family of models is only available '
                      'for the input data format "channels_last" '
                      '(width, height, channels). '
                      'However your settings specify the default '
                      'data format "channels_first" (channels, width, height).'
                      ' You should set `image_data_format="channels_last"` '
                      'in your Keras config located at ~/.keras/keras.json. '
                      'The model being returned right now will expect inputs '
                      'to follow the "channels_last" data format.')
        previous_data_format = K.image_data_format()
        K.set_image_data_format('channels_last')

    try:
        if input_tensor is None:
            img_input = Input(shape=input_shape)
        elif not K.is_keras_tensor(input_tensor):
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor

        channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
        filters = penultimate_filters // 24

        if not skip_reduction:
            x = Conv2D(stem_filters, (3, 3), strides=(2, 2), padding='valid', use_bias=False, name='stem_conv1',
                       kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(img_input)
        else:
            x = Conv2D(stem_filters, (3, 3), strides=(1, 1), padding='same', use_bias=False, name='stem_conv1',
                       kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(img_input)

        x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
                               name='stem_bn1')(x)

        p = None
        if not skip_reduction:  # imagenet / mobile mode
            x, p = _reduction_A(x, p, filters // (filters_multiplier ** 2), weight_decay, id='stem_1')
            x, p = _reduction_A(x, p, filters // filters_multiplier, weight_decay, id='stem_2')

        # Stage 1
        for i in range(nb_blocks):
            x, p = _normal_A(x, p, filters, weight_decay, id='%d' % (i))

        x, p0 = _reduction_A(x, p, filters * filters_multiplier, weight_decay, id='reduce_%d' % (nb_blocks))

        # Only the imagenet / mobile mode takes the reduction cell's `p`.
        p = p0 if not skip_reduction else p

        # Stage 2
        for i in range(nb_blocks):
            x, p = _normal_A(x, p, filters * filters_multiplier, weight_decay, id='%d' % (nb_blocks + i + 1))

        auxiliary_x = None
        if not skip_reduction:  # imagenet / mobile mode
            if use_auxiliary_branch:
                auxiliary_x = _add_auxiliary_head(x, classes, weight_decay)

        x, p0 = _reduction_A(x, p, filters * filters_multiplier ** 2, weight_decay, id='reduce_%d' % (2 * nb_blocks))

        if skip_reduction:  # CIFAR mode
            if use_auxiliary_branch:
                auxiliary_x = _add_auxiliary_head(x, classes, weight_decay)

        p = p0 if not skip_reduction else p

        # Stage 3
        for i in range(nb_blocks):
            x, p = _normal_A(x, p, filters * filters_multiplier ** 2, weight_decay, id='%d' % (2 * nb_blocks + i + 1))

        x = Activation('relu')(x)

        if include_top:
            x = GlobalAveragePooling2D()(x)
            x = Dropout(dropout)(x)
            x = Dense(classes, activation='softmax', kernel_regularizer=l2(weight_decay), name='predictions')(x)
        else:
            if pooling == 'avg':
                x = GlobalAveragePooling2D()(x)
            elif pooling == 'max':
                x = GlobalMaxPooling2D()(x)

        # Ensure that the model takes into account
        # any potential predecessors of `input_tensor`.
        if input_tensor is not None:
            inputs = get_source_inputs(input_tensor)
        else:
            inputs = img_input

        # Create model.
        if use_auxiliary_branch:
            model = Model(inputs, [x, auxiliary_x], name='NASNet_with_auxiliary')
        else:
            model = Model(inputs, x, name='NASNet')

        # load weights
        if weights == 'imagenet':
            if default_size == 224:  # mobile version
                if include_top:
                    if use_auxiliary_branch:
                        weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY
                        model_name = 'nasnet_mobile_with_aux.h5'
                    else:
                        weight_path = NASNET_MOBILE_WEIGHT_PATH
                        model_name = 'nasnet_mobile.h5'
                else:
                    if use_auxiliary_branch:
                        weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP
                        model_name = 'nasnet_mobile_with_aux_no_top.h5'
                    else:
                        weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP
                        model_name = 'nasnet_mobile_no_top.h5'
            else:  # default_size == 331, large version (checked above)
                if include_top:
                    if use_auxiliary_branch:
                        weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary
                        model_name = 'nasnet_large_with_aux.h5'
                    else:
                        weight_path = NASNET_LARGE_WEIGHT_PATH
                        model_name = 'nasnet_large.h5'
                else:
                    if use_auxiliary_branch:
                        weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP
                        model_name = 'nasnet_large_with_aux_no_top.h5'
                    else:
                        weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP
                        model_name = 'nasnet_large_no_top.h5'

            weights_file = get_file(model_name, weight_path, cache_subdir='models')
            model.load_weights(weights_file, by_name=True)
    finally:
        # Always undo the temporary switch of the global data format,
        # including when model construction or weight loading raised.
        if previous_data_format is not None:
            K.set_image_data_format(previous_data_format)

    return model
+ # Raises + ValueError: in case of invalid argument for `weights`, + or invalid input shape. + RuntimeError: If attempting to run this model with a + backend that does not support separable convolutions. + """ + global _BN_DECAY, _BN_EPSILON + _BN_DECAY = 0.9997 + _BN_EPSILON = 1e-3 + + return NASNet(input_shape, + penultimate_filters=4032, + nb_blocks=6, + stem_filters=96, + skip_reduction=False, + use_auxiliary_branch=use_auxiliary_branch, + filters_multiplier=2, + dropout=dropout, + weight_decay=weight_decay, + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + pooling=pooling, + classes=classes, + default_size=331) + + +def NASNetMobile(input_shape=(224, 224, 3), + dropout=0.5, + weight_decay=4e-5, + use_auxiliary_branch=False, + include_top=True, + weights='imagenet', + input_tensor=None, + pooling=None, + classes=1000): + """Instantiates a NASNet architecture in Mobile ImageNet mode. + Note that only TensorFlow is supported for now, + therefore it only works with the data format + `image_data_format='channels_last'` in your Keras config + at `~/.keras/keras.json`. + + # Arguments + input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(224, 224, 3)` for NASNetMobile + It should have exactly 3 inputs channels, + and width and height should be no smaller than 32. + E.g. `(224, 224, 3)` would be one valid value. + use_auxiliary_branch: Whether to use the auxiliary branch during + training or evaluation. + dropout: dropout rate + weight_decay: l2 regularization weight + include_top: whether to include the fully-connected + layer at the top of the network. + weights: `None` (random initialization) or + `imagenet` (ImageNet weights) + input_tensor: optional Keras tensor (i.e. output of + `layers.Input()`) + to use as image input for the model. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. 
def NASNetCIFAR(input_shape=(32, 32, 3),
                dropout=0.0,
                weight_decay=5e-4,
                use_auxiliary_branch=False,
                include_top=True,
                weights=None,
                input_tensor=None,
                pooling=None,
                classes=10):
    """Instantiates a NASNet architecture in CIFAR mode.

    Note that only TensorFlow is supported for now,
    therefore it only works with the data format
    `image_data_format='channels_last'` in your Keras config
    at `~/.keras/keras.json`.

    Calling this function overwrites the module-level batch
    normalization settings `_BN_DECAY` and `_BN_EPSILON`, which are
    read by every NASNet cell built afterwards.

    # Arguments
        input_shape: optional shape tuple, defaults to `(32, 32, 3)`.
            It should have exactly 3 inputs channels,
            and width and height should be no smaller than 32.
        dropout: dropout rate
        weight_decay: l2 regularization weight
        use_auxiliary_branch: Whether to use the auxiliary branch during
            training or evaluation.
        include_top: whether to include the fully-connected
            layer at the top of the network.
        weights: `None` (random initialization). `NASNet` only loads
            ImageNet weights for the 224 (mobile) and 331 (large)
            default sizes, so `imagenet` is rejected with a
            `ValueError` for this variant.
        input_tensor: optional Keras tensor (i.e. output of
            `layers.Input()`)
            to use as image input for the model.
        pooling: Optional pooling mode for feature extraction
            when `include_top` is `False`.
            - `None` means that the output of the model
                will be the 4D tensor output of the
                last convolutional layer.
            - `avg` means that global average pooling
                will be applied to the output of the
                last convolutional layer, and thus
                the output of the model will be a
                2D tensor.
            - `max` means that global max pooling will
                be applied.
        classes: optional number of classes to classify images
            into, only to be specified if `include_top` is True.

    # Returns
        A Keras model instance.

    # Raises
        ValueError: in case of invalid argument for `weights`,
            or invalid input shape.
        RuntimeError: If attempting to run this model with a
            backend that does not support separable convolutions.
    """
    global _BN_DECAY, _BN_EPSILON
    _BN_DECAY = 0.9
    _BN_EPSILON = 1e-5

    # `default_size` must be the CIFAR image size: 224 is the value NASNet
    # uses to recognise NASNetMobile when picking ImageNet weight files.
    return NASNet(input_shape,
                  penultimate_filters=768,
                  nb_blocks=6,
                  stem_filters=32,
                  skip_reduction=True,
                  use_auxiliary_branch=use_auxiliary_branch,
                  filters_multiplier=2,
                  dropout=dropout,
                  weight_decay=weight_decay,
                  include_top=include_top,
                  weights=weights,
                  input_tensor=input_tensor,
                  pooling=pooling,
                  classes=classes,
                  default_size=32)
def _normal_A(ip, p, filters, weight_decay=5e-5, id=None):
    '''Builds one Normal cell of NASNet-A (Fig. 4 in the paper).

    # Arguments:
        ip: input tensor `x` (output of the previous cell)
        p: input tensor `p` (may be `None`; it is first passed through
            `_adjust_block` together with `ip`)
        filters: number of output filters of each branch
        weight_decay: l2 regularization weight
        id: string id used as suffix of all layer names

    # Returns:
        a tuple `(x, ip)`: the concatenated cell output and the
        unmodified input `ip`
    '''
    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1

    def _avg_pool_3x3(layer_name):
        # 3x3, stride-1, 'same' average pooling shared by branches 3 and 4
        return AveragePooling2D((3, 3), strides=(1, 1), padding='same', name=layer_name)

    with K.name_scope('normal_A_block_%s' % id):
        prev = _adjust_block(p, ip, filters, weight_decay, id)

        # relu -> 1x1 conv -> batch norm on the current input
        hidden = Activation('relu')(ip)
        hidden = Conv2D(filters, (1, 1), strides=(1, 1), padding='same', name='normal_conv_1_%s' % id,
                        use_bias=False, kernel_initializer='he_normal',
                        kernel_regularizer=l2(weight_decay))(hidden)
        hidden = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
                                    name='normal_bn_1_%s' % id)(hidden)

        with K.name_scope('block_1'):
            left = _separable_conv_block(hidden, filters, kernel_size=(5, 5), weight_decay=weight_decay,
                                         id='normal_left1_%s' % id)
            right = _separable_conv_block(prev, filters, weight_decay=weight_decay,
                                          id='normal_right1_%s' % id)
            branch_1 = add([left, right], name='normal_add_1_%s' % id)

        with K.name_scope('block_2'):
            left = _separable_conv_block(prev, filters, (5, 5), weight_decay=weight_decay,
                                         id='normal_left2_%s' % id)
            right = _separable_conv_block(prev, filters, (3, 3), weight_decay=weight_decay,
                                          id='normal_right2_%s' % id)
            branch_2 = add([left, right], name='normal_add_2_%s' % id)

        with K.name_scope('block_3'):
            pooled = _avg_pool_3x3('normal_left3_%s' % (id))(hidden)
            branch_3 = add([pooled, prev], name='normal_add_3_%s' % id)

        with K.name_scope('block_4'):
            left = _avg_pool_3x3('normal_left4_%s' % (id))(prev)
            right = _avg_pool_3x3('normal_right4_%s' % (id))(prev)
            branch_4 = add([left, right], name='normal_add_4_%s' % id)

        with K.name_scope('block_5'):
            separable = _separable_conv_block(hidden, filters, weight_decay=weight_decay,
                                              id='normal_left5_%s' % id)
            branch_5 = add([separable, hidden], name='normal_add_5_%s' % id)

        cell_output = concatenate([prev, branch_1, branch_2, branch_3, branch_4, branch_5],
                                  axis=channel_dim, name='normal_concat_%s' % id)
    return cell_output, ip
def _add_auxiliary_head(x, classes, weight_decay):
    '''Builds the auxiliary classifier branch on top of `x`.

    From section A.7 "Training of ImageNet models" of the paper, all NASNet models are
    trained using an auxiliary classifier around 2/3 of the depth of the network, with
    a loss weight of 0.4

    # Arguments
        x: input tensor
        classes: number of output classes
        weight_decay: l2 regularization weight

    # Returns
        a keras Tensor holding the softmax output of the auxiliary classifier
    '''
    data_format = K.image_data_format()
    img_height = 1 if data_format == 'channels_last' else 2
    img_width = 2 if data_format == 'channels_last' else 3
    channel_axis = 1 if data_format == 'channels_first' else -1

    with K.name_scope('auxiliary_branch'):
        # relu -> 5x5 average pooling (stride 3) -> 1x1 projection to 128 filters
        head = Activation('relu')(x)
        head = AveragePooling2D((5, 5), strides=(3, 3), padding='valid', name='aux_pool')(head)
        head = Conv2D(128, (1, 1), padding='same', use_bias=False, name='aux_conv_projection',
                      kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(head)
        head = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
                                  name='aux_bn_projection')(head)
        head = Activation('relu')(head)

        # The kernel spans the whole remaining feature map ('valid' padding).
        full_map_kernel = (head._keras_shape[img_height], head._keras_shape[img_width])
        head = Conv2D(768, full_map_kernel,
                      padding='valid', use_bias=False, kernel_initializer='he_normal',
                      kernel_regularizer=l2(weight_decay), name='aux_conv_reduction')(head)
        head = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
                                  name='aux_bn_reduction')(head)
        head = Activation('relu')(head)

        head = GlobalAveragePooling2D()(head)
        head = Dense(classes, activation='softmax', kernel_regularizer=l2(weight_decay),
                     name='aux_predictions')(head)
    return head
+ +Implementation Adapted from: github.com/raghakot/keras-resnet +""" +from __future__ import division + +import six +from keras.models import Model +from keras.layers import Input +from keras.layers import Activation +from keras.layers import Reshape +from keras.layers import Dense +from keras.layers import Flatten +from keras.layers import Conv2D +from keras.layers import MaxPooling2D +from keras.layers import AveragePooling2D +from keras.layers.pooling import GlobalAveragePooling2D +from keras.layers import GlobalMaxPooling2D +from keras.layers import GlobalAveragePooling2D +from keras.layers import Dropout +from keras.layers.merge import add +from keras.layers.normalization import BatchNormalization +from keras.regularizers import l2 +from keras import backend as K +from keras.applications.imagenet_utils import _obtain_input_shape + + +def _bn_relu(x, bn_name=None, relu_name=None): + """Helper to build a BN -> relu block + """ + norm = BatchNormalization(axis=CHANNEL_AXIS, name=bn_name)(x) + return Activation("relu", name=relu_name)(norm) + + +def _conv_bn_relu(**conv_params): + """Helper to build a conv -> BN -> relu residual unit activation function. 
def _bn_relu_conv(**conv_params):
    """Return a function applying BN -> relu -> conv (full pre-activation).

    This is the ResNet v2 scheme proposed in http://arxiv.org/pdf/1603.05027v2.pdf

    `filters` and `kernel_size` are required keyword arguments. Optional
    keywords and their defaults: `strides` (1, 1), `dilation_rate` (1, 1),
    `conv_name` None, `bn_name` None, `relu_name` None,
    `kernel_initializer` "he_normal", `padding` "same" and
    `kernel_regularizer` l2(1.e-4). Other keywords are ignored.
    """
    filters = conv_params["filters"]
    kernel_size = conv_params["kernel_size"]
    # Options of the convolution itself.
    conv_options = dict(
        strides=conv_params.get("strides", (1, 1)),
        dilation_rate=conv_params.get("dilation_rate", (1, 1)),
        name=conv_params.get("conv_name", None),
    )
    bn_name = conv_params.get("bn_name", None)
    relu_name = conv_params.get("relu_name", None)
    conv_options["kernel_initializer"] = conv_params.get("kernel_initializer", "he_normal")
    conv_options["padding"] = conv_params.get("padding", "same")
    conv_options["kernel_regularizer"] = conv_params.get("kernel_regularizer", l2(1.e-4))

    def f(x):
        pre_activated = _bn_relu(x, bn_name=bn_name, relu_name=relu_name)
        conv = Conv2D(filters=filters, kernel_size=kernel_size, **conv_options)
        return conv(pre_activated)

    return f
+ if stride_width > 1 or stride_height > 1 or not equal_channels: + print('reshaping via a convolution...') + if conv_name_base is not None: + conv_name_base = conv_name_base + '1' + shortcut = Conv2D(filters=residual_shape[CHANNEL_AXIS], + kernel_size=(1, 1), + strides=(stride_width, stride_height), + padding="valid", + kernel_initializer="he_normal", + kernel_regularizer=l2(0.0001), + name=conv_name_base)(input_feature) + if bn_name_base is not None: + bn_name_base = bn_name_base + '1' + shortcut = BatchNormalization(axis=CHANNEL_AXIS, name=bn_name_base)(shortcut) + + return add([shortcut, residual]) + + +def _residual_block(block_function, filters, blocks, stage, + transition_strides=None, transition_dilation_rates=None, + dilation_rates=(1, 1), is_first_layer=False, dropout=None, + residual_unit=_bn_relu_conv): + """Builds a residual block with repeating bottleneck blocks. + + stage: integer, current stage label, used for generating layer names + blocks: number of blocks 'a','b'..., current block label, used for generating layer names + transition_strides: a list of tuples for the strides of each transition + transition_dilation_rates: a list of tuples for the dilation rate of each transition + """ + if transition_dilation_rates is None: + transition_dilation_rates = [(1, 1)] * blocks + if transition_strides is None: + transition_strides = [(1, 1)] * blocks + + def f(x): + for i in range(blocks): + x = block_function(filters=filters, stage=stage, block=i, + transition_strides=transition_strides[i], + dilation_rate=dilation_rates[i], + is_first_block_of_first_layer=(is_first_layer and i == 0), + dropout=dropout, + residual_unit=residual_unit)(x) + return x + + return f + + +def _block_name_base(stage, block): + """Get the convolution name base and batch normalization name base defined by stage and block. + + If there are less than 26 blocks they will be labeled 'a', 'b', 'c' to match the paper and keras + and beyond 26 blocks they will simply be numbered. 
+ """ + if block < 27: + block = '%c' % (block + 97) # 97 is the ascii number for lowercase 'a' + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + return conv_name_base, bn_name_base + + +def basic_block(filters, stage, block, transition_strides=(1, 1), + dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None, + residual_unit=_bn_relu_conv): + """Basic 3 X 3 convolution blocks for use on resnets with layers <= 34. + Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf + """ + def f(input_features): + conv_name_base, bn_name_base = _block_name_base(stage, block) + if is_first_block_of_first_layer: + # don't repeat bn->relu since we just did bn->relu->maxpool + x = Conv2D(filters=filters, kernel_size=(3, 3), + strides=transition_strides, + dilation_rate=dilation_rate, + padding="same", + kernel_initializer="he_normal", + kernel_regularizer=l2(1e-4), + name=conv_name_base + '2a')(input_features) + else: + x = residual_unit(filters=filters, kernel_size=(3, 3), + strides=transition_strides, + dilation_rate=dilation_rate, + conv_name_base=conv_name_base + '2a', + bn_name_base=bn_name_base + '2a')(input_features) + + if dropout is not None: + x = Dropout(dropout)(x) + + x = residual_unit(filters=filters, kernel_size=(3, 3), + conv_name_base=conv_name_base + '2b', + bn_name_base=bn_name_base + '2b')(x) + + return _shortcut(input_features, x) + + return f + + +def bottleneck(filters, stage, block, transition_strides=(1, 1), + dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None, + residual_unit=_bn_relu_conv): + """Bottleneck architecture for > 34 layer resnet. 
def _handle_dim_ordering():
    """Set the module-level ROW_AXIS, COL_AXIS and CHANNEL_AXIS globals.

    The indices are chosen from the currently configured Keras image data
    format: rows, columns, channels are axes 1, 2, 3 for 'channels_last',
    otherwise channels, rows, columns are axes 1, 2, 3.
    """
    global ROW_AXIS
    global COL_AXIS
    global CHANNEL_AXIS
    if K.image_data_format() == 'channels_last':
        ROW_AXIS, COL_AXIS, CHANNEL_AXIS = 1, 2, 3
    else:
        CHANNEL_AXIS, ROW_AXIS, COL_AXIS = 1, 2, 3


def _string_to_function(identifier):
    """Resolve a string to the module-level object of that name.

    Non-string identifiers are returned unchanged.

    Raises:
        ValueError: if `identifier` is a string and no truthy module-level
            object with that name exists.
    """
    # Anything that is not a string is assumed to already be the object.
    if not isinstance(identifier, six.string_types):
        return identifier

    resolved = globals().get(identifier)
    if not resolved:
        raise ValueError('Invalid {}'.format(identifier))
    return resolved
final_pooling=None, top='classification'): + """Builds a custom ResNet like architecture. Defaults to ResNet50 v2. + + Args: + input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(224, 224, 3)` (with `channels_last` dim ordering) + or `(3, 224, 224)` (with `channels_first` dim ordering). + It should have exactly 3 inputs channels, + and width and height should be no smaller than 8. + E.g. `(224, 224, 3)` would be one valid value. + classes: The number of outputs at final softmax layer + block: The block function to use. This is either `'basic'` or `'bottleneck'`. + The original paper used `basic` for layers < 50. + repetitions: Number of repetitions of various block units. + At each block unit, the number of filters are doubled and the input size is halved. + Default of None implies the ResNet50v2 values of [3, 4, 6, 3]. + transition_dilation_rate: Used for pixel-wise prediction tasks such as image segmentation. + residual_unit: the basic residual unit, 'v1' for conv bn relu, 'v2' for bn relu conv. + See [Identity Mappings in Deep Residual Networks](https://arxiv.org/abs/1603.05027) + for details. + dropout: None for no dropout, otherwise rate of dropout from 0 to 1. + Based on [Wide Residual Networks.(https://arxiv.org/pdf/1605.07146) paper. + transition_dilation_rate: Dilation rate for transition layers. For semantic + segmentation of images use a dilation rate of (2, 2). + initial_strides: Stride of the very first residual unit and MaxPooling2D call, + with default (2, 2), set to (1, 1) for small images like cifar. + initial_kernel_size: kernel size of the very first convolution, (7, 7) for imagenet + and (3, 3) for small image datasets like tiny imagenet and cifar. + See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details. + initial_pooling: Determine if there will be an initial pooling layer, + 'max' for imagenet and None for small image datasets. 
+ See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details. + final_pooling: Optional pooling mode for feature extraction at the final model layer + when `include_top` is `False`. + - `None` means that the output of the model + will be the 4D tensor output of the + last convolutional layer. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional layer, and thus + the output of the model will be a + 2D tensor. + - `max` means that global max pooling will + be applied. + top: Defines final layers to evaluate based on a specific problem type. Options are + 'classification' for ImageNet style problems, 'segmentation' for problems like + the Pascal VOC dataset, and None to exclude these layers entirely. + + Returns: + The keras `Model`. + """ + if activation not in ['softmax', 'sigmoid', None]: + raise ValueError('activation must be one of "softmax", "sigmoid", or None') + if activation == 'sigmoid' and classes != 1: + raise ValueError('sigmoid activation can only be used when classes = 1') + if repetitions is None: + repetitions = [3, 4, 6, 3] + # Determine proper input shape + input_shape = _obtain_input_shape(input_shape, + default_size=32, + min_size=8, + data_format=K.image_data_format(), + require_flatten=include_top) + _handle_dim_ordering() + if len(input_shape) != 3: + raise Exception("Input shape should be a tuple (nb_channels, nb_rows, nb_cols)") + + if block == 'basic': + block_fn = basic_block + elif block == 'bottleneck': + block_fn = bottleneck + elif isinstance(block, six.string_types): + block_fn = _string_to_function(block) + else: + block_fn = block + + if residual_unit == 'v2': + residual_unit = _bn_relu_conv + elif residual_unit == 'v1': + residual_unit = _conv_bn_relu + elif isinstance(residual_unit, six.string_types): + residual_unit = _string_to_function(residual_unit) + else: + residual_unit = residual_unit + + # Permute dimension order if necessary + if K.image_data_format() == 
'channels_first': + input_shape = (input_shape[1], input_shape[2], input_shape[0]) + # Determine proper input shape + input_shape = _obtain_input_shape(input_shape, + default_size=32, + min_size=8, + data_format=K.image_data_format(), + require_flatten=include_top) + + img_input = Input(shape=input_shape, tensor=input_tensor) + x = _conv_bn_relu(filters=initial_filters, kernel_size=initial_kernel_size, strides=initial_strides)(img_input) + if initial_pooling == 'max': + x = MaxPooling2D(pool_size=(3, 3), strides=initial_strides, padding="same")(x) + + block = x + filters = initial_filters + for i, r in enumerate(repetitions): + transition_dilation_rates = [transition_dilation_rate] * r + transition_strides = [(1, 1)] * r + if transition_dilation_rate == (1, 1): + transition_strides[0] = (2, 2) + block = _residual_block(block_fn, filters=filters, + stage=i, blocks=r, + is_first_layer=(i == 0), + dropout=dropout, + transition_dilation_rates=transition_dilation_rates, + transition_strides=transition_strides, + residual_unit=residual_unit)(block) + filters *= 2 + + # Last activation + x = _bn_relu(block) + + # Classifier block + if include_top and top is 'classification': + x = GlobalAveragePooling2D()(x) + x = Dense(units=classes, activation=activation, kernel_initializer="he_normal")(x) + elif include_top and top is 'segmentation': + x = Conv2D(classes, (1, 1), activation='linear', padding='same')(x) + + if K.image_data_format() == 'channels_first': + channel, row, col = input_shape + else: + row, col, channel = input_shape + + x = Reshape((row * col, classes))(x) + x = Activation(activation)(x) + x = Reshape((row, col, classes))(x) + elif final_pooling == 'avg': + x = GlobalAveragePooling2D()(x) + elif final_pooling == 'max': + x = GlobalMaxPooling2D()(x) + + model = Model(inputs=img_input, outputs=x) + return model + + +def ResNet18(input_shape, classes): + """ResNet with 18 layers and v2 residual units + """ + return ResNet(input_shape, classes, basic_block, 
repetitions=[2, 2, 2, 2]) + + +def ResNet34(input_shape, classes): + """ResNet with 34 layers and v2 residual units + """ + return ResNet(input_shape, classes, basic_block, repetitions=[3, 4, 6, 3]) + + +def ResNet50(input_shape, classes): + """ResNet with 50 layers and v2 residual units + """ + return ResNet(input_shape, classes, bottleneck, repetitions=[3, 4, 6, 3]) + + +def ResNet101(input_shape, classes): + """ResNet with 101 layers and v2 residual units + """ + return ResNet(input_shape, classes, bottleneck, repetitions=[3, 4, 23, 3]) + + +def ResNet152(input_shape, classes): + """ResNet with 152 layers and v2 residual units + """ + return ResNet(input_shape, classes, bottleneck, repetitions=[3, 8, 36, 3]) diff --git a/keras_contrib/applications/wide_resnet.py b/keras_contrib/applications/wide_resnet.py index 647d63a..5df99f8 100644 --- a/keras_contrib/applications/wide_resnet.py +++ b/keras_contrib/applications/wide_resnet.py @@ -89,7 +89,7 @@ def WideResidualNetwork(depth=28, width=8, dropout_rate=0.0, default_size=32, min_size=8, data_format=K.image_dim_ordering(), - include_top=include_top) + require_flatten=include_top) if input_tensor is None: img_input = Input(shape=input_shape) diff --git a/keras_contrib/backend/cntk_backend.py b/keras_contrib/backend/cntk_backend.py index 363ad08..624aeee 100644 --- a/keras_contrib/backend/cntk_backend.py +++ b/keras_contrib/backend/cntk_backend.py @@ -1,2 +1,26 @@ from keras.backend import cntk_backend as KCN import cntk as C +import numpy as np + + +def clip(x, min_value, max_value): + """Element-wise value clipping. + + If min_value > max_value, clipping range is [min_value,min_value]. + + # Arguments + x: Tensor or variable. + min_value: Tensor, float, int, or None. + If min_value is None, defaults to -infinity. + max_value: Tensor, float, int, or None. + If max_value is None, defaults to infinity. + + # Returns + A tensor. 
+ """ + if max_value is None: + max_value = np.inf + if min_value is None: + min_value = -np.inf + max_value = C.maximum(min_value, max_value) + return C.clip(x, min_value, max_value) diff --git a/keras_contrib/backend/tensorflow_backend.py b/keras_contrib/backend/tensorflow_backend.py index 7b69687..284cbe4 100644 --- a/keras_contrib/backend/tensorflow_backend.py +++ b/keras_contrib/backend/tensorflow_backend.py @@ -1,28 +1,71 @@ import tensorflow as tf +import numpy as np try: from tensorflow.python.ops import ctc_ops as ctc except ImportError: import tensorflow.contrib.ctc as ctc from keras.backend import tensorflow_backend as KTF -from keras.backend.common import floatx, image_data_format -from keras.backend.tensorflow_backend import _preprocess_conv3d_input -from keras.backend.tensorflow_backend import _postprocess_conv3d_output -from keras.backend.tensorflow_backend import _preprocess_padding -from keras.backend.tensorflow_backend import _preprocess_conv2d_input -from keras.backend.tensorflow_backend import _postprocess_conv2d_output +from keras.backend import dtype +from keras.backend.common import floatx +from keras.backend.common import image_data_format +from keras.backend.tensorflow_backend import _to_tensor py_all = all -def _preprocess_deconv_output_shape(x, shape, data_format): +def _preprocess_conv2d_input(x, data_format): + """Transpose and cast the input before the conv2d. + # Arguments + x: input tensor. + data_format: string, `"channels_last"` or `"channels_first"`. + # Returns + A tensor. + """ + if dtype(x) == 'float64': + x = tf.cast(x, 'float32') if data_format == 'channels_first': - shape = (shape[0],) + tuple(shape[2:]) + (shape[1],) + # TF uses the last dimension as channel dimension, + # instead of the 2nd one. 
+ # TH input shape: (samples, input_depth, rows, cols) + # TF input shape: (samples, rows, cols, input_depth) + x = tf.transpose(x, (0, 2, 3, 1)) + return x - if shape[0] is None: - shape = (tf.shape(x)[0],) + tuple(shape[1:]) - shape = tf.stack(list(shape)) - return shape + +def _postprocess_conv2d_output(x, data_format): + """Transpose and cast the output from conv2d if needed. + # Arguments + x: A tensor. + data_format: string, `"channels_last"` or `"channels_first"`. + # Returns + A tensor. + """ + + if data_format == 'channels_first': + x = tf.transpose(x, (0, 3, 1, 2)) + + if floatx() == 'float64': + x = tf.cast(x, 'float64') + return x + + +def _preprocess_padding(padding): + """Convert keras' padding to tensorflow's padding. + # Arguments + padding: string, `"same"` or `"valid"`. + # Returns + a string, `"SAME"` or `"VALID"`. + # Raises + ValueError: if `padding` is invalid. + """ + if padding == 'same': + padding = 'SAME' + elif padding == 'valid': + padding = 'VALID' + else: + raise ValueError('Invalid padding:', padding) + return padding def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_first', @@ -70,45 +113,6 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_fir return x -def deconv3d(x, kernel, output_shape, strides=(1, 1, 1), - padding='valid', - data_format='default', - image_shape=None, filter_shape=None): - '''3D deconvolution (i.e. transposed convolution). - - # Arguments - x: input tensor. - kernel: kernel tensor. - output_shape: 1D int tensor for the output shape. - strides: strides tuple. - padding: string, "same" or "valid". - data_format: "tf" or "th". - Whether to use Theano or TensorFlow dimension ordering - for inputs/kernels/ouputs. - - # Returns - A tensor, result of transposed 3D convolution. - - # Raises - ValueError: if `data_format` is neither `tf` or `th`. 
- ''' - if data_format == 'default': - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) - - x = _preprocess_conv3d_input(x, data_format) - output_shape = _preprocess_deconv_output_shape(x, output_shape, - data_format) - kernel = tf.transpose(kernel, (0, 1, 2, 4, 3)) - padding = _preprocess_padding(padding) - strides = (1,) + strides + (1,) - - x = tf.nn.conv3d_transpose(x, kernel, output_shape, strides, - padding=padding) - return _postprocess_conv3d_output(x, data_format) - - def extract_image_patches(x, ksizes, ssizes, padding='same', data_format='channels_last'): ''' @@ -158,3 +162,28 @@ def moments(x, axes, shift=None, keep_dims=False): ''' Wrapper over tensorflow backend call ''' return tf.nn.moments(x, axes, shift=shift, keep_dims=keep_dims) + + +def clip(x, min_value, max_value): + """Element-wise value clipping. + + If min_value > max_value, clipping range is [min_value,min_value]. + + # Arguments + x: Tensor or variable. + min_value: Tensor, float, int, or None. + If min_value is None, defaults to -infinity. + max_value: Tensor, float, int, or None. + If max_value is None, defaults to infinity. + + # Returns + A tensor. 
+ """ + if max_value is None: + max_value = np.inf + if min_value is None: + min_value = -np.inf + min_value = _to_tensor(min_value, x.dtype.base_dtype) + max_value = _to_tensor(max_value, x.dtype.base_dtype) + max_value = tf.maximum(min_value, max_value) + return tf.clip_by_value(x, min_value, max_value) diff --git a/keras_contrib/backend/theano_backend.py b/keras_contrib/backend/theano_backend.py index 2b5adaf..78af0ef 100644 --- a/keras_contrib/backend/theano_backend.py +++ b/keras_contrib/backend/theano_backend.py @@ -1,5 +1,6 @@ from theano import tensor as T from theano.sandbox.neighbours import images2neibs +import numpy as np try: import theano.sparse as th_sparse_module @@ -85,56 +86,6 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_fir return conv_out -def deconv3d(x, kernel, output_shape, strides=(1, 1, 1), - padding='valid', - data_format=None, filter_shape=None): - '''3D deconvolution (transposed convolution). - - # Arguments - kernel: kernel tensor. - output_shape: desired dimensions of output. - strides: strides tuple. - padding: string, "same" or "valid". - data_format: "channels_last" or "channels_first". - Whether to use Theano or TensorFlow dimension ordering - in inputs/kernels/ouputs. - ''' - flip_filters = False - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - if data_format == 'channels_last': - output_shape = (output_shape[0], output_shape[4], output_shape[1], - output_shape[2], output_shape[3]) - - x = _preprocess_conv3d_input(x, data_format) - kernel = _preprocess_conv3d_kernel(kernel, data_format) - kernel = kernel.dimshuffle((1, 0, 2, 3, 4)) - th_padding = _preprocess_padding(padding) - - if hasattr(kernel, '_keras_shape'): - kernel_shape = kernel._keras_shape - else: - # Will only work if `kernel` is a shared variable. 
- kernel_shape = kernel.eval().shape - - filter_shape = _preprocess_conv3d_filter_shape(filter_shape, data_format) - filter_shape = tuple(filter_shape[i] for i in (1, 0, 2, 3, 4)) - - conv_out = T.nnet.abstract_conv.conv3d_grad_wrt_inputs( - x, kernel, output_shape, - filter_shape=filter_shape, - border_mode=th_padding, - subsample=strides, - filter_flip=not flip_filters) - - conv_out = _postprocess_conv3d_output(conv_out, x, padding, - kernel_shape, strides, data_format) - return conv_out - - def extract_image_patches(X, ksizes, strides, padding='valid', data_format='channels_first'): ''' Extract the patches from an image @@ -197,3 +148,26 @@ def moments(x, axes, shift=None, keep_dims=False): var_batch = KTH.var(x, axis=axes, keepdims=keep_dims) return mean_batch, var_batch + + +def clip(x, min_value, max_value): + """Element-wise value clipping. + + If min_value > max_value, clipping range is [min_value,min_value]. + + # Arguments + x: Tensor or variable. + min_value: Tensor, float, int, or None. + If min_value is None, defaults to -infinity. + max_value: Tensor, float, int, or None. + If max_value is None, defaults to infinity. + + # Returns + A tensor. 
+ """ + if max_value is None: + max_value = np.inf + if min_value is None: + min_value = -np.inf + max_value = T.maximum(min_value, max_value) + return T.clip(x, min_value, max_value) diff --git a/keras_contrib/callbacks/dead_relu_detector.py b/keras_contrib/callbacks/dead_relu_detector.py index 2019f56..2cfe37b 100644 --- a/keras_contrib/callbacks/dead_relu_detector.py +++ b/keras_contrib/callbacks/dead_relu_detector.py @@ -1,8 +1,6 @@ import numpy as np -import warnings from keras.callbacks import Callback -from keras.layers import Dense from keras import backend as K @@ -13,10 +11,11 @@ class DeadReluDetector(Callback): # Arguments x_train: Training dataset to check whether or not neurons fire verbose: verbosity mode - True means that even a single dead neuron triggers warning + True means that even a single dead neuron triggers a warning message False means that only significant number of dead neurons (10% or more) - triggers warning + triggers a warning message """ + def __init__(self, x_train, verbose=False): super(DeadReluDetector, self).__init__() self.x_train = x_train @@ -25,7 +24,8 @@ class DeadReluDetector(Callback): @staticmethod def is_relu_layer(layer): - return isinstance(layer, Dense) and layer.get_config()['activation'] == 'relu' + # Should work for all layers with relu activation. Tested for Dense and Conv2D + return 'activation' in layer.get_config() and layer.get_config()['activation'] == 'relu' def get_relu_activations(self): model_input = self.model.input @@ -44,17 +44,43 @@ class DeadReluDetector(Callback): layer_outputs = [func(list_inputs)[0] for func in funcs] for layer_index, layer_activations in enumerate(layer_outputs): if self.is_relu_layer(self.model.layers[layer_index]): - yield [layer_index, layer_activations] + layer_name = self.model.layers[layer_index].name + # layer_weight is a list [W] (+ [b]) + layer_weight = self.model.layers[layer_index].get_weights() + # with kernel and bias, the weights are saved as a list [W, b]. 
If only weights, it is [W] + if type(layer_weight) is not list: + raise ValueError("'Layer_weight' should be a list, but was {}".format(type(layer_weight))) + + layer_weight_shape = np.shape(layer_weight[0]) + yield [layer_index, layer_activations, layer_name, layer_weight_shape] def on_epoch_end(self, epoch, logs={}): for relu_activation in self.get_relu_activations(): - layer_index, activation_values = relu_activation - total_neurons = activation_values.shape[-1] - dead_neurons = np.sum(activation_values == 0) - dead_neurons_share = dead_neurons / total_neurons - if (self.verbose and dead_neurons > 0) or dead_neurons_share > self.dead_neurons_share_threshold: - warnings.warn( - 'Layer #{} has {} dead neurons ({:.2%})!' - .format(layer_index, dead_neurons, dead_neurons_share), - RuntimeWarning - ) + layer_index, activation_values, layer_name, layer_weight_shape = relu_activation + + shape_act = activation_values.shape + + weight_len = len(layer_weight_shape) + act_len = len(shape_act) + + # should work for both Conv and Flat + if K.image_data_format() == 'channels_last': + # features in last axis + axis_filter = -1 + else: + # features before the convolution axis, for weight_len the input and output have to be subtracted + axis_filter = -1 - (weight_len - 2) + + total_featuremaps = shape_act[axis_filter] + + axis = tuple( + i for i in range(act_len) if (i != axis_filter) and (i != (len(shape_act) + axis_filter))) + + dead_neurons = np.sum(np.sum(activation_values, axis=axis) == 0) + + dead_neurons_share = float(dead_neurons) / float(total_featuremaps) + if (self.verbose and dead_neurons > 0) or dead_neurons_share >= self.dead_neurons_share_threshold: + str_warning = 'Layer {} (#{}) has {} dead neurons ({:.2%})!'.format(layer_name, layer_index, + dead_neurons, dead_neurons_share) + + print(str_warning) diff --git a/keras_contrib/datasets/conll2000.py b/keras_contrib/datasets/conll2000.py old mode 100644 new mode 100755 index 22a97e1..5561f17 --- 
a/keras_contrib/datasets/conll2000.py +++ b/keras_contrib/datasets/conll2000.py @@ -16,7 +16,7 @@ def load_data(path='conll2000.zip', min_freq=2): archive.close() word_counts = Counter(row[0].lower() for sample in train for row in sample) - vocab = ['', ''] + [w for w, f in word_counts.iteritems() if f >= min_freq] + vocab = ['', ''] + [w for w, f in iter(word_counts.items()) if f >= min_freq] pos_tags = sorted(list(set(row[1] for sample in train + test for row in sample))) # in alphabetic order chunk_tags = sorted(list(set(row[2] for sample in train + test for row in sample))) # in alphabetic order @@ -27,7 +27,7 @@ def load_data(path='conll2000.zip', min_freq=2): def _parse_data(fh): string = fh.read() - data = [[row.split() for row in sample.split('\n')] for sample in string.strip().split('\n\n')] + data = [[row.split() for row in sample.split('\n')] for sample in string.decode().strip().split('\n\n')] fh.close() return data diff --git a/keras_contrib/layers/advanced_activations.py b/keras_contrib/layers/advanced_activations.py index 179856c..7bf349e 100644 --- a/keras_contrib/layers/advanced_activations.py +++ b/keras_contrib/layers/advanced_activations.py @@ -236,3 +236,50 @@ class SReLU(Layer): return dict(list(base_config.items()) + list(config.items())) get_custom_objects().update({'SReLU': SReLU}) + + +class Swish(Layer): + """ Swish (Ramachandranet al., 2017) + + # Input shape + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + + # Output shape + Same shape as the input. + + # Arguments + beta: float >= 0. 
Scaling factor + if set to 1 and trainable set to False (default), Swish equals the SiLU activation (Elfwing et al., 2017) + trainable: whether to learn the scaling factor during training or not + + # References + - [Searching for Activation Functions](https://arxiv.org/abs/1710.05941) + - [Sigmoid-weighted linear units for neural network function approximation in reinforcement learning](https://arxiv.org/abs/1702.03118) + """ + + def __init__(self, beta=1.0, trainable=False, **kwargs): + super(Swish, self).__init__(**kwargs) + self.supports_masking = True + self.beta = beta + self.trainable = trainable + + def build(self, input_shape): + self.scaling_factor = K.variable(self.beta, + dtype=K.floatx(), + name='scaling_factor') + if self.trainable: + self._trainable_weights.append(self.scaling_factor) + super(Swish, self).build(input_shape) + + def call(self, inputs, mask=None): + return inputs * K.sigmoid(self.scaling_factor * inputs) + + def get_config(self): + config = {'beta': self.get_weights()[0] if self.trainable else self.beta, + 'trainable': self.trainable} + base_config = super(Swish, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + +get_custom_objects().update({'Swish': Swish}) diff --git a/keras_contrib/layers/convolutional.py b/keras_contrib/layers/convolutional.py index c60df62..0899309 100644 --- a/keras_contrib/layers/convolutional.py +++ b/keras_contrib/layers/convolutional.py @@ -16,220 +16,6 @@ from keras.utils.conv_utils import normalize_data_format import numpy as np -class Deconvolution3D(Convolution3D): - """Transposed convolution operator for filtering windows of 3-D inputs. 
- - The need for transposed convolutions generally arises from the desire to - use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape - of the output of some convolution to something that has the shape - of its input while maintaining a connectivity pattern - that is compatible with said convolution. - - When using this layer as the first layer in a model, - provide the keyword argument `input_shape` - (tuple of integers, does not include the sample axis), - e.g. `input_shape=(3, 128, 128, 128)` for a 128x128x128 volume with - three channels. - - To pass the correct `output_shape` to this layer, - one could use a test model to predict and observe the actual output shape. - - # Examples - - ```python - # TH dim ordering. - # apply a 3x3x3 transposed convolution - # with stride 1x1x1 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 3, 14, 14, 14), - padding='valid', - input_shape=(3, 12, 12, 12))) - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 3, 12, 12, 12)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 3, 14, 14, 14) - - # apply a 3x3x3 transposed convolution - # with stride 2x2x2 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 3, 25, 25, 25), - strides=(2, 2, 2), - padding='valid', - input_shape=(3, 12, 12, 12))) - model.summary() - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 3, 12, 12, 12)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 3, 25, 25, 25) - ``` - - ```python - # TF dim ordering. 
- # apply a 3x3x3 transposed convolution - # with stride 1x1x1 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 14, 14, 14, 3), - padding='valid', - input_shape=(12, 12, 12, 3))) - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 12, 12, 12, 3)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 14, 14, 14, 3) - - # apply a 3x3x3 transposed convolution - # with stride 2x2x2 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 25, 25, 25, 3), - strides=(2, 2, 2), - padding='valid', - input_shape=(12, 12, 12, 3))) - model.summary() - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 12, 12, 12, 3)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 25, 25, 25, 3) - ``` - - # Arguments - filters: Number of transposed convolution filters to use. - kernel_size: kernel_size: An integer or tuple/list of 3 integers, specifying the - dimensions of the convolution window. - output_shape: Output shape of the transposed convolution operation. - tuple of integers - `(nb_samples, filters, conv_dim1, conv_dim2, conv_dim3)`. - It is better to use - a dummy input and observe the actual output shape of - a layer, as specified in the examples. - init: name of initialization function for the weights of the layer - (see [initializers](../initializers.md)), or alternatively, - Theano function to use for weights initialization. - This parameter is only relevant if you don't pass - a `weights` argument. - activation: name of activation function to use - (see [activations](../activations.md)), - or alternatively, elementwise Theano/TensorFlow function. - If you don't specify anything, no activation is applied - (ie. "linear" activation: a(x) = x). - weights: list of numpy arrays to set as initial weights. 
- padding: 'valid', 'same' or 'full' - ('full' requires the Theano backend). - strides: tuple of length 3. Factor by which to oversample output. - Also called strides elsewhere. - kernel_regularizer: instance of [WeightRegularizer](../regularizers.md) - (eg. L1 or L2 regularization), applied to the main weights matrix. - bias_regularizer: instance of [WeightRegularizer](../regularizers.md), - applied to the use_bias. - activity_regularizer: instance of [ActivityRegularizer](../regularizers.md), - applied to the network output. - kernel_constraint: instance of the [constraints](../constraints.md) module - (eg. maxnorm, nonneg), applied to the main weights matrix. - bias_constraint: instance of the [constraints](../constraints.md) module, - applied to the use_bias. - data_format: 'channels_first' or 'channels_last'. In 'channels_first' mode, the channels dimension - (the depth) is at index 1, in 'channels_last' mode is it at index 4. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "tf". - use_bias: whether to include a use_bias - (i.e. make the layer affine rather than linear). - - # Input shape - 5D tensor with shape: - `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if data_format='channels_first' - or 5D tensor with shape: - `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if data_format='channels_last'. - - # Output shape - 5D tensor with shape: - `(samples, filters, nekernel_conv_dim1, nekernel_conv_dim2, nekernel_conv_dim3)` if data_format='channels_first' - or 5D tensor with shape: - `(samples, nekernel_conv_dim1, nekernel_conv_dim2, nekernel_conv_dim3, filters)` if data_format='channels_last'. - `nekernel_conv_dim1`, `nekernel_conv_dim2` and `nekernel_conv_dim3` values might have changed due to padding. 
- - # References - - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1) - - [Transposed convolution arithmetic](http://deeplearning.net/software/theano_versions/dev/tutorial/conv_arithmetic.html#transposed-convolution-arithmetic) - - [Deconvolutional Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf) - """ - - def __init__(self, filters, kernel_size, - output_shape, activation=None, weights=None, - padding='valid', strides=(1, 1, 1), data_format=None, - kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, - kernel_constraint=None, bias_constraint=None, - use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', **kwargs): - if padding not in {'valid', 'same', 'full'}: - raise ValueError('Invalid border mode for Deconvolution3D:', padding) - if len(output_shape) == 4: - # missing the batch size - output_shape = (None,) + tuple(output_shape) - - self.output_shape_ = output_shape - - super(Deconvolution3D, self).__init__(kernel_size=kernel_size, - filters=filters, - activation=activation, - weights=weights, - padding=padding, - strides=strides, - data_format=data_format, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - **kwargs) - - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - conv_dim1 = self.output_shape_[2] - conv_dim2 = self.output_shape_[3] - conv_dim3 = self.output_shape_[4] - return (input_shape[0], self.filters, conv_dim1, conv_dim2, conv_dim3) - elif self.data_format == 'channels_last': - conv_dim1 = self.output_shape_[1] - conv_dim2 = self.output_shape_[2] - conv_dim3 = self.output_shape_[3] - return (input_shape[0], conv_dim1, conv_dim2, conv_dim3, self.filters) - else: - 
raise ValueError('Invalid data format: ', self.data_format) - - def call(self, x, mask=None): - kernel_shape = K.get_value(self.kernel).shape - output = K.deconv3d(x, self.kernel, self.output_shape_, - strides=self.strides, - padding=self.padding, - data_format=self.data_format, - filter_shape=kernel_shape) - if self.use_bias: - if self.data_format == 'channels_first': - output += K.reshape(self.bias, (1, self.filters, 1, 1, 1)) - elif self.data_format == 'channels_last': - output += K.reshape(self.bias, (1, 1, 1, 1, self.filters)) - else: - raise ValueError('Invalid data_format: ', self.data_format) - output = self.activation(output) - return output - - def get_config(self): - config = {'output_shape': self.output_shape_} - base_config = super(Deconvolution3D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -Deconv3D = Deconvolution3D -get_custom_objects().update({'Deconvolution3D': Deconvolution3D}) -get_custom_objects().update({'Deconv3D': Deconv3D}) - - class CosineConvolution2D(Layer): """Cosine Normalized Convolution operator for filtering windows of two-dimensional inputs. 
Cosine Normalization: Using Cosine Similarity Instead of Dot Product in Neural Networks diff --git a/keras_contrib/layers/normalization.py b/keras_contrib/layers/normalization.py index 53b356b..40254cd 100644 --- a/keras_contrib/layers/normalization.py +++ b/keras_contrib/layers/normalization.py @@ -219,7 +219,7 @@ class BatchRenormalization(Layer): self.initial_weights = weights self.r_max_value = r_max_value self.d_max_value = d_max_value - self.t_delta = K.variable(np.array(t_delta)) + self.t_delta = t_delta self.beta_initializer = initializers.get(beta_initializer) self.gamma_initializer = initializers.get(gamma_initializer) self.moving_mean_initializer = initializers.get(moving_mean_initializer) @@ -266,11 +266,13 @@ class BatchRenormalization(Layer): name='{}_running_std'.format(self.name), trainable=False) - self.r_max = K.variable(np.ones((1,)), name='{}_r_max'.format(self.name)) + self.r_max = K.variable(1, name='{}_r_max'.format(self.name)) - self.d_max = K.variable(np.zeros((1,)), name='{}_d_max'.format(self.name)) + self.d_max = K.variable(0, name='{}_d_max'.format(self.name)) - self.t = K.variable(np.zeros((1,)), name='{}_t'.format(self.name)) + self.t = K.variable(0, name='{}_t'.format(self.name)) + + self.t_delta_tensor = K.constant(self.t_delta) if self.initial_weights is not None: self.set_weights(self.initial_weights) @@ -290,13 +292,11 @@ class BatchRenormalization(Layer): mean_batch, var_batch = K.moments(inputs, reduction_axes, shift=None, keep_dims=False) std_batch = (K.sqrt(var_batch + self.epsilon)) - r_max_value = K.get_value(self.r_max) r = std_batch / (K.sqrt(self.running_variance + self.epsilon)) - r = K.stop_gradient(K.clip(r, 1 / r_max_value, r_max_value)) + r = K.stop_gradient(K.clip(r, 1 / self.r_max, self.r_max)) - d_max_value = K.get_value(self.d_max) d = (mean_batch - self.running_mean) / K.sqrt(self.running_variance + self.epsilon) - d = K.stop_gradient(K.clip(d, -d_max_value, d_max_value)) + d = K.stop_gradient(K.clip(d, 
-self.d_max, self.d_max)) if sorted(reduction_axes) == range(K.ndim(inputs))[:-1]: x_normed_batch = (inputs - mean_batch) / std_batch @@ -323,7 +323,7 @@ class BatchRenormalization(Layer): self.add_update([K.update(self.r_max, r_val), K.update(self.d_max, d_val), - K.update_add(self.t, self.t_delta)], x) + K.update_add(self.t, self.t_delta_tensor)], inputs) if training in {0, False}: return x_normed @@ -358,13 +358,15 @@ class BatchRenormalization(Layer): def get_config(self): config = {'epsilon': self.epsilon, 'axis': self.axis, + 'center': self.center, + 'scale': self.scale, + 'momentum': self.momentum, 'gamma_regularizer': initializers.serialize(self.gamma_regularizer), 'beta_regularizer': initializers.serialize(self.beta_regularizer), 'moving_mean_initializer': initializers.serialize(self.moving_mean_initializer), 'moving_variance_initializer': initializers.serialize(self.moving_variance_initializer), 'beta_constraint': constraints.serialize(self.beta_constraint), 'gamma_constraint': constraints.serialize(self.gamma_constraint), - 'momentum': self.momentum, 'r_max_value': self.r_max_value, 'd_max_value': self.d_max_value, 't_delta': self.t_delta} diff --git a/keras_contrib/layers/recurrent.py b/keras_contrib/layers/recurrent.py index e85dc22..c85a6c6 100644 --- a/keras_contrib/layers/recurrent.py +++ b/keras_contrib/layers/recurrent.py @@ -8,5 +8,3 @@ from .. import initializers from .. import regularizers from keras.engine import Layer from keras.engine import InputSpec - -from keras.layers.recurrent import _time_distributed_dense diff --git a/keras_contrib/optimizers/ftml.py b/keras_contrib/optimizers/ftml.py index edfb9d7..7545934 100644 --- a/keras_contrib/optimizers/ftml.py +++ b/keras_contrib/optimizers/ftml.py @@ -2,7 +2,6 @@ from __future__ import absolute_import from keras.optimizers import Optimizer from .. 
import backend as K from keras.utils.generic_utils import get_custom_objects -from keras.legacy import interfaces class FTML(Optimizer): @@ -31,7 +30,6 @@ class FTML(Optimizer): self.epsilon = epsilon self.inital_decay = decay - @interfaces.legacy_get_updates_support def get_updates(self, loss, params): grads = self.get_gradients(loss, params) self.updates = [K.update_add(self.iterations, 1)] diff --git a/setup.py b/setup.py index aad0567..3c17537 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,32 @@ from setuptools import find_packages setup(name='keras_contrib', - version='1.2.1', - description='Keras community contributions', + version='2.0.8', + description='Keras Deep Learning for Python, Community Contributions', author='Fariz Rahman', author_email='farizrahman4u@gmail.com', url='https://github.com/farizrahman4u/keras-contrib', license='MIT', install_requires=['keras'], + extras_require={ + 'h5py': ['h5py'], + 'visualize': ['pydot>=1.2.0'], + 'tests': ['pytest', + 'pytest-pep8', + 'pytest-xdist', + 'pytest-cov'], + }, + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules' + ], packages=find_packages()) diff --git a/tests/keras_contrib/backend/backend_test.py b/tests/keras_contrib/backend/backend_test.py index 64135b0..c3829e3 100644 --- a/tests/keras_contrib/backend/backend_test.py +++ b/tests/keras_contrib/backend/backend_test.py @@ -1,7 +1,6 @@ import pytest from numpy.testing import assert_allclose import numpy as np -import scipy.sparse as sparse from keras import backend as K from keras.backend import theano_backend as 
KTH, floatx, set_floatx, variable @@ -157,8 +156,46 @@ class TestBackend(object): th_var_val = KTH.eval(th_var) tf_var_val = KTF.eval(tf_var) - assert_allclose(th_mean_val, tf_mean_val, rtol=1e-4) - assert_allclose(th_var_val, tf_var_val, rtol=1e-4) + # absolute tolerance needed when working with zeros + assert_allclose(th_mean_val, tf_mean_val, rtol=1e-4, atol=1e-10) + assert_allclose(th_var_val, tf_var_val, rtol=1e-4, atol=1e-10) + + def test_clip(self): + check_single_tensor_operation('clip', (4, 2), min_value=0.4, max_value=0.6) + check_single_tensor_operation('clip', (4, 2), min_value=0.4, max_value=None) + + cases = [ + # (x, min_value, max_value, expected) + (1, 0, 2, 1), + (1, 2, 0, 2), + (-1, 0, 2, 0), + (-1, 2, 0, 2), + (3, 0, 2, 2), + (3, 2, 0, 2), + (1, 0, np.inf, 1), + (1, np.inf, 0, np.inf), + (1, 0, -np.inf, 0), + (1, -np.inf, 0, 0), + (-1, 0, -np.inf, 0), + (-1, -np.inf, 0, -1), + (1, 0, None, 1), + (-1, 0, None, 0), + + # NOTE: In the following two cases, Keras 2.0.8 raises an + # error on all backends, but this is a sensible extension. + (1, None, 0, 0), + (-1, None, 0, -1), + + # NOTE: In the following case, Keras 2.0.8 rasies an error + # for TensorFlow and Theano, but returns 0 for CNTK. This + # extends the TensorFlow and Theano backends to match the + # CNTK behavior instead of raising an error. 
+ (0, None, None, 0), + ] + for K_, KC_ in [(KTF, KCTF), (KTH, KCTH)]: + for x, min_value, max_value, expected in cases: + actual = K_.eval(KC_.clip(K_.constant(x), min_value, max_value)) + assert_allclose(expected, actual, atol=1e-5) if __name__ == '__main__': diff --git a/tests/keras_contrib/callbacks/dead_relu_detector_test.py b/tests/keras_contrib/callbacks/dead_relu_detector_test.py index 9a37df9..5f7c396 100644 --- a/tests/keras_contrib/callbacks/dead_relu_detector_test.py +++ b/tests/keras_contrib/callbacks/dead_relu_detector_test.py @@ -1,40 +1,191 @@ import pytest -import warnings import numpy as np +import sys + +if (sys.version_info > (3, 0)): + from io import StringIO +else: + from StringIO import StringIO from keras_contrib import callbacks from keras.models import Sequential -from keras.layers import Dense +from keras.layers import Dense, Conv2D, Flatten +from keras import backend as K + +n_out = 11 # with 1 neuron dead, 1/11 is just below the threshold of 10% with verbose = False + + +def check_print(do_train, expected_warnings, nr_dead=None, perc_dead=None): + """ + Receive stdout to check if correct warning message is delivered + :param nr_dead: int + :param perc_dead: float, 10% should be written as 0.1 + """ + + saved_stdout = sys.stdout + + out = StringIO() + out.flush() + sys.stdout = out # overwrite current stdout + + do_train() + + stdoutput = out.getvalue().strip() # get prints, can be something like: "Layer dense (#0) has 2 dead neurons (20.00%)!" 
+ str_to_count = "dead neurons" + count = stdoutput.count(str_to_count) + + sys.stdout = saved_stdout # restore stdout + out.close() + + assert expected_warnings == count + if expected_warnings and (nr_dead is not None): + str_to_check = 'has {} dead'.format(nr_dead) + assert str_to_check in stdoutput, '"{}" not in "{}"'.format(str_to_check, stdoutput) + if expected_warnings and (perc_dead is not None): + str_to_check = 'neurons ({:.2%})!'.format(perc_dead) + assert str_to_check in stdoutput, '"{}" not in "{}"'.format(str_to_check, stdoutput) def test_DeadDeadReluDetector(): - def do_test(weights, expected_warnings, verbose): - with warnings.catch_warnings(record=True) as w: - dataset = np.ones((1, 1, 1)) # data to be fed as training + n_samples = 9 + + input_shape = (n_samples, 3, 4) # 4 input features + shape_out = (n_samples, 3, n_out) # 11 output features + shape_weights = (4, n_out) + + # ignore batch size + input_shape_dense = tuple(input_shape[1:]) + + def do_test(weights, expected_warnings, verbose, nr_dead=None, perc_dead=None): + + def do_train(): + dataset = np.ones(input_shape) # data to be fed as training model = Sequential() - model.add(Dense(10, activation='relu', input_shape=(1, 1), use_bias=False, weights=[weights])) + model.add(Dense(n_out, activation='relu', input_shape=input_shape_dense, + use_bias=False, weights=[weights], name='dense')) model.compile(optimizer='sgd', loss='categorical_crossentropy') model.fit( dataset, - np.ones((1, 1, 10)), + np.ones(shape_out), + batch_size=1, epochs=1, callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)], verbose=False ) - assert len(w) == expected_warnings - for warn_item in w: - assert issubclass(warn_item.category, RuntimeWarning) - assert "dead neurons" in str(warn_item.message) - weights_1_dead = np.ones((1, 10)) # weights that correspond to NN with 1/10 neurons dead + check_print(do_train, expected_warnings, nr_dead, perc_dead) + + weights_1_dead = np.ones(shape_weights) # weights that 
correspond to NN with 1/11 neurons dead + weights_2_dead = np.ones(shape_weights) # weights that correspond to NN with 2/11 neurons dead + weights_all_dead = np.zeros(shape_weights) # weights that correspond to all neurons dead + weights_1_dead[:, 0] = 0 - weights_2_dead = np.ones((1, 10)) # weights that correspond to NN with 2/10 neurons dead - weights_2_dead[:, 0] = 0 - weights_2_dead[:, 1] = 0 + weights_2_dead[:, 0:2] = 0 - do_test(weights_1_dead, verbose=True, expected_warnings=1) + do_test(weights_1_dead, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out) do_test(weights_1_dead, verbose=False, expected_warnings=0) - do_test(weights_2_dead, verbose=True, expected_warnings=1) + do_test(weights_2_dead, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out) + # do_test(weights_all_dead, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.) + + +def test_DeadDeadReluDetector_bias(): + n_samples = 9 + + input_shape = (n_samples, 4) # 4 input features + shape_weights = (4, n_out) + shape_bias = (n_out, ) + shape_out = (n_samples, n_out) # 11 output features + + # ignore batch size + input_shape_dense = tuple(input_shape[1:]) + + def do_test(weights, bias, expected_warnings, verbose, nr_dead=None, perc_dead=None): + + def do_train(): + dataset = np.ones(input_shape) # data to be fed as training + model = Sequential() + model.add(Dense(n_out, activation='relu', input_shape=input_shape_dense, + use_bias=True, weights=[weights, bias], name='dense')) + model.compile(optimizer='sgd', loss='categorical_crossentropy') + model.fit( + dataset, + np.ones(shape_out), + batch_size=1, + epochs=1, + callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)], + verbose=False + ) + + check_print(do_train, expected_warnings, nr_dead, perc_dead) + + weights_1_dead = np.ones(shape_weights) # weights that correspond to NN with 1/11 neurons dead + weights_2_dead = np.ones(shape_weights) # weights that correspond to NN with 2/11 neurons dead + 
weights_all_dead = np.zeros(shape_weights) # weights that correspond to all neurons dead + + weights_1_dead[:, 0] = 0 + weights_2_dead[:, 0:2] = 0 + + bias = np.zeros(shape_bias) + + do_test(weights_1_dead, bias, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out) + do_test(weights_1_dead, bias, verbose=False, expected_warnings=0) + do_test(weights_2_dead, bias, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out) + # do_test(weights_all_dead, bias, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.) + + +def test_DeadDeadReluDetector_conv(): + n_samples = 9 + + # (5, 5) kernel, 4 input featuremaps and 11 output featuremaps + if K.image_data_format() == 'channels_last': + input_shape = (n_samples, 5, 5, 4) + else: + input_shape = (n_samples, 4, 5, 5) + + # ignore batch size + input_shape_conv = tuple(input_shape[1:]) + shape_weights = (5, 5, 4, n_out) + shape_out = (n_samples, n_out) + + def do_test(weights_bias, expected_warnings, verbose, nr_dead=None, perc_dead=None): + """ + :param perc_dead: as float, 10% should be written as 0.1 + """ + + def do_train(): + dataset = np.ones(input_shape) # data to be fed as training + model = Sequential() + model.add(Conv2D(n_out, (5, 5), activation='relu', input_shape=input_shape_conv, + use_bias=True, weights=weights_bias, name='conv')) + model.add(Flatten()) # to handle Theano's categorical crossentropy + model.compile(optimizer='sgd', loss='categorical_crossentropy') + model.fit( + dataset, + np.ones(shape_out), + batch_size=1, + epochs=1, + callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)], + verbose=False + ) + + check_print(do_train, expected_warnings, nr_dead, perc_dead) + + weights_1_dead = np.ones(shape_weights) # weights that correspond to NN with 1/11 neurons dead + weights_1_dead[..., 0] = 0 + weights_2_dead = np.ones(shape_weights) # weights that correspond to NN with 2/11 neurons dead + weights_2_dead[..., 0:2] = 0 + weights_all_dead = 
np.zeros(shape_weights) # weights that correspond to NN with all neurons dead + + bias = np.zeros((11, )) + + weights_bias_1_dead = [weights_1_dead, bias] + weights_bias_2_dead = [weights_2_dead, bias] + weights_bias_all_dead = [weights_all_dead, bias] + + do_test(weights_bias_1_dead, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out) + do_test(weights_bias_1_dead, verbose=False, expected_warnings=0) + do_test(weights_bias_2_dead, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out) + # do_test(weights_bias_all_dead, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.) if __name__ == '__main__': diff --git a/tests/keras_contrib/layers/test_advanced_activations.py b/tests/keras_contrib/layers/test_advanced_activations.py index f07a690..8c71426 100644 --- a/tests/keras_contrib/layers/test_advanced_activations.py +++ b/tests/keras_contrib/layers/test_advanced_activations.py @@ -26,5 +26,18 @@ def test_srelu_share(): layer_test(advanced_activations.SReLU, kwargs={'shared_axes': 1}, input_shape=(2, 3, 4)) + +@keras_test +def test_swish_constant(): + layer_test(advanced_activations.Swish, kwargs={'beta': 1.0, 'trainable': False}, + input_shape=(2, 3, 4)) + + +@keras_test +def test_swish_trainable(): + layer_test(advanced_activations.Swish, kwargs={'beta': 1.0, 'trainable': True}, + input_shape=(2, 3, 4)) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras_contrib/layers/test_convolutional.py b/tests/keras_contrib/layers/test_convolutional.py index 1760226..d207656 100644 --- a/tests/keras_contrib/layers/test_convolutional.py +++ b/tests/keras_contrib/layers/test_convolutional.py @@ -17,67 +17,6 @@ else: _convolution_border_modes = ['valid', 'same'] -@keras_test -def test_deconvolution_3d(): - num_samples = 6 - num_filter = 4 - stack_size = 2 - kernel_dim1 = 12 - kernel_dim2 = 10 - kernel_dim3 = 8 - - for batch_size in [None, num_samples]: - for border_mode in _convolution_border_modes: - for subsample 
in [(1, 1, 1), (2, 2, 2)]: - if border_mode == 'same' and subsample != (1, 1, 1): - continue - - dim1 = conv_input_length(kernel_dim1, 7, - border_mode, - subsample[0]) - dim2 = conv_input_length(kernel_dim2, 5, - border_mode, - subsample[1]) - dim3 = conv_input_length(kernel_dim3, 3, - border_mode, - subsample[2]) - layer_test(convolutional.Deconvolution3D, - kwargs={'filters': num_filter, - 'kernel_size': (7, 5, 3), - 'output_shape': (batch_size, num_filter, dim1, dim2, dim3), - 'padding': border_mode, - 'strides': subsample, - 'data_format': 'channels_first'}, - input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3), - - fixed_batch_size=True, tolerance=None) - - layer_test(convolutional.Deconvolution3D, - kwargs={'filters': num_filter, - 'kernel_size': (7, 5, 3), - 'output_shape': (batch_size, num_filter, dim1, dim2, dim3), - 'padding': border_mode, - 'strides': subsample, - 'data_format': 'channels_first', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2'}, - input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3), - fixed_batch_size=True, tolerance=None) - - layer_test(convolutional.Deconvolution3D, - kwargs={'filters': num_filter, - 'kernel_size': (7, 5, 3), - 'output_shape': (num_filter, dim1, dim2, dim3), - 'padding': border_mode, - 'strides': subsample, - 'data_format': 'channels_first', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2'}, - input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3), tolerance=None) - - @keras_test def test_cosineconvolution_2d(): num_samples = 2 diff --git a/tests/keras_contrib/layers/test_normalization.py b/tests/keras_contrib/layers/test_normalization.py index 3321d09..1c74a04 100644 --- a/tests/keras_contrib/layers/test_normalization.py +++ b/tests/keras_contrib/layers/test_normalization.py @@ -25,9 +25,7 @@ def basic_instancenorm_test(): input_shape=(3, 4, 2)) 
layer_test(normalization.InstanceNormalization, kwargs={'gamma_initializer': 'ones', - 'beta_initializer': 'ones', - 'moving_mean_initializer': 'zeros', - 'moving_variance_initializer': 'ones'}, + 'beta_initializer': 'ones'}, input_shape=(3, 4, 2)) layer_test(normalization.InstanceNormalization, kwargs={'scale': False, 'center': False}, @@ -190,7 +188,7 @@ def test_instancenorm_perchannel_correctness(): for channel in range(3): activations = out[instance, channel] assert abs(activations.mean()) > 1e-2 - assert abs(activations.std() - 1.0) > 1e-2 + assert abs(activations.std() - 1.0) > 1e-6 # but values are still normalized per-instance activations = out[instance] @@ -229,10 +227,11 @@ def basic_batchrenorm_test(): @keras_test def test_batchrenorm_mode_0_or_2(): - for training in [1, 0]: - model = Sequential() - norm_m0 = normalization.BatchRenormalization(input_shape=(10,), momentum=0.8) - model.add(norm_m0) + for training in [1, 0, None]: + ip = Input(shape=(10,)) + norm_m0 = normalization.BatchRenormalization(momentum=0.8) + out = norm_m0(ip, training=training) + model = Model(ip, out) model.compile(loss='mse', optimizer='sgd') # centered on 5.0, variance 10.0 @@ -306,5 +305,37 @@ def test_shared_batchrenorm(): new_model.train_on_batch(x, x) +@keras_test +def test_batchrenorm_clipping_schedule(): + '''Test that the clipping schedule isn't fixed at r_max=1, d_max=0''' + inp = Input(shape=(10,)) + bn = normalization.BatchRenormalization(t_delta=1.) 
+ out = bn(inp) + model = Model(inp, out) + model.compile('sgd', 'mse') + + x = np.random.normal(5, 10, size=(2, 10)) + y = np.random.normal(5, 10, size=(2, 10)) + + r_max, d_max = K.get_value(bn.r_max), K.get_value(bn.d_max) + assert r_max == 1 + assert d_max == 0 + + for i in range(10): + model.train_on_batch(x, y) + + r_max, d_max = K.get_value(bn.r_max), K.get_value(bn.d_max) + assert_allclose([r_max, d_max], [3, 5], atol=1e-1) + + +@keras_test +def test_batchrenorm_get_config(): + '''Test that get_config works on a model with a batchrenorm layer.''' + x = Input(shape=(10,)) + y = normalization.BatchRenormalization()(x) + model = Model(x, y) + model.get_config() + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras_contrib/utils/save_load_utils_test.py b/tests/keras_contrib/utils/save_load_utils_test.py index 67f55fc..a11e826 100644 --- a/tests/keras_contrib/utils/save_load_utils_test.py +++ b/tests/keras_contrib/utils/save_load_utils_test.py @@ -1,12 +1,16 @@ import pytest +import os from keras import backend as K from keras.layers import Input, Dense from keras.models import Model from numpy.testing import assert_allclose +from keras.utils.test_utils import keras_test from keras_contrib.utils.save_load_utils import save_all_weights, load_all_weights +@pytest.mark.skipif(K.backend() != 'tensorflow', reason='save_all_weights and load_all_weights only supported on TensorFlow') +@keras_test def test_save_and_load_all_weights(): ''' Test save_all_weights and load_all_weights. Save and load optimizer and model weights but not configuration. 
@@ -33,15 +37,16 @@ def test_save_and_load_all_weights(): ow1value[0, 0:3] = [4, 2, 0] K.set_value(ow1, ow1value) # save all weights - save_all_weights(m1, "model.h5") + save_all_weights(m1, 'model.h5') # new model m2 = make_model() # load all weights - load_all_weights(m2, "model.h5") + load_all_weights(m2, 'model.h5') # check weights assert_allclose(K.get_value(m2.layers[1].kernel)[0, 0:4], [1, 3, 3, 7]) # check optimizer weights assert_allclose(K.get_value(m2.optimizer.weights[3])[0, 0:3], [4, 2, 0]) + os.remove('model.h5') if __name__ == '__main__':