diff --git a/.travis.yml b/.travis.yml index 3edc3ee..be93c59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,25 +32,39 @@ install: - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py - source activate test-environment - - pip install pytest-cov python-coveralls pytest-xdist coverage==3.7.1 #we need this version of coverage for coveralls.io to work + - pip install pytest-cov pytest-xdist - pip install pep8 pytest-pep8 + - conda install mkl mkl-service - pip install theano - pip install git+git://github.com/fchollet/keras.git # install PIL for preprocessing tests - #- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - # conda install pil; - # elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then - # conda install Pillow; - # fi + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + conda install pil; + elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then + conda install Pillow; + fi - - python setup.py install + - pip install -e .[tests] - # install TensorFlow (CPU) + # install TensorFlow (CPU version). 
- pip install tensorflow + + # install cntk + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.2-cp27-cp27mu-linux_x86_64.whl; + elif [[ "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then + pip install https://cntk.ai/PythonWheel/CPU-Only/cntk-2.2-cp35-cp35m-linux_x86_64.whl; + fi + + # install pydot for visualization tests + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + conda install pydot graphviz; + fi # command to run tests script: + - export MKL_THREADING_LAYER="GNU" # run keras backend init to initialize backend config - python -c "import keras.backend" # create dataset directory to avoid concurrent directory creation at runtime @@ -61,7 +75,5 @@ script: - if [[ "$TEST_MODE" == "PEP8" ]]; then PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0; else - PYTHONPATH=$PWD:$PYTHONPATH py.test tests/; + PYTHONPATH=$PWD:$PYTHONPATH py.test tests/ --ignore=tests/integration_tests --ignore=tests/test_documentation.py --cov=keras tests/ --cov-report term-missing; fi -after_success: - - coveralls diff --git a/GUIDELINES.md b/GUIDELINES.md index bb95dcd..df0bed9 100644 --- a/GUIDELINES.md +++ b/GUIDELINES.md @@ -2,6 +2,7 @@ ## Maintainers: Following are the users with write-access to this repository (maintainers) : +* [athundt](https://www.github.com/athundt) * [bstriner](https://www.github.com/bstriner) * [farizrahman4u](https://www.github.com/farizrahman4u) * [fchollet](https://www.github.com/fchollet) diff --git a/examples/cifar10_nasnet.py b/examples/cifar10_nasnet.py new file mode 100644 index 0000000..56c75ee --- /dev/null +++ b/examples/cifar10_nasnet.py @@ -0,0 +1,106 @@ +""" +Adapted from keras example cifar10_cnn.py +Train NASNet-CIFAR on the CIFAR10 small images dataset. 
+""" +from __future__ import print_function +from keras.datasets import cifar10 +from keras.preprocessing.image import ImageDataGenerator +from keras.utils import np_utils +from keras.callbacks import ModelCheckpoint +from keras.callbacks import ReduceLROnPlateau +from keras.callbacks import CSVLogger +from keras.optimizers import Adam +from keras_contrib.applications.nasnet import NASNetCIFAR, preprocess_input + +import numpy as np + + +weights_file = 'NASNet-CIFAR-10.h5' +lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.5), cooldown=0, patience=5, min_lr=0.5e-5) +csv_logger = CSVLogger('NASNet-CIFAR-10.csv') +model_checkpoint = ModelCheckpoint(weights_file, monitor='val_predictions_acc', save_best_only=True, + save_weights_only=True, mode='max') + +batch_size = 128 +nb_classes = 10 +nb_epoch = 600 +data_augmentation = True + +# input image dimensions +img_rows, img_cols = 32, 32 +# The CIFAR10 images are RGB. +img_channels = 3 + +# The data, shuffled and split between train and test sets: +(X_train, y_train), (X_test, y_test) = cifar10.load_data() + +# Convert class vectors to binary class matrices. 
+Y_train = np_utils.to_categorical(y_train, nb_classes) +Y_test = np_utils.to_categorical(y_test, nb_classes) + +X_train = X_train.astype('float32') +X_test = X_test.astype('float32') + +# preprocess input +X_train = preprocess_input(X_train) +X_test = preprocess_input(X_test) + +# For training, the auxiliary branch must be used to correctly train NASNet +model = NASNetCIFAR((img_rows, img_cols, img_channels), use_auxiliary_branch=True) +model.summary() + +optimizer = Adam(lr=1e-3, clipnorm=5) +model.compile(loss=['categorical_crossentropy', 'categorical_crossentropy'], + optimizer=optimizer, metrics=['accuracy'], loss_weights=[1.0, 0.4]) + +# model.load_weights('NASNet-CIFAR-10.h5', by_name=True) + +if not data_augmentation: + print('Not using data augmentation.') + model.fit(X_train, [Y_train, Y_train], + batch_size=batch_size, + epochs=nb_epoch, + validation_data=(X_test, [Y_test, Y_test]), + shuffle=True, + verbose=2, + callbacks=[lr_reducer, csv_logger, model_checkpoint]) +else: + print('Using real-time data augmentation.') + # This will do preprocessing and realtime data augmentation: + datagen = ImageDataGenerator( + featurewise_center=False, # set input mean to 0 over the dataset + samplewise_center=False, # set each sample mean to 0 + featurewise_std_normalization=False, # divide inputs by std of the dataset + samplewise_std_normalization=False, # divide each input by its std + zca_whitening=False, # apply ZCA whitening + rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) + width_shift_range=0.1, # randomly shift images horizontally (fraction of total width) + height_shift_range=0.1, # randomly shift images vertically (fraction of total height) + horizontal_flip=True, # randomly flip images + vertical_flip=False) # randomly flip images + + # Compute quantities required for featurewise normalization + # (std, mean, and principal components if ZCA whitening is applied).
+ datagen.fit(X_train) + + # wrap the ImageDataGenerator to yield two label batches [y, y] for each input batch X + # When training a NASNet model, we have to use its auxiliary training head + # Therefore the model is technically a 1 input - 2 output model, and requires + # the label to be duplicated for the auxiliary head + def image_data_generator_wrapper(image_datagenerator, batch_size): + iterator = image_datagenerator.flow(X_train, Y_train, batch_size=batch_size) + + while True: + X, y = next(iterator) # get the next batch + yield X, [y, y] # duplicate the labels for each batch + + # Fit the model on the batches generated by datagen.flow(). + model.fit_generator(image_data_generator_wrapper(datagen, batch_size), + steps_per_epoch=X_train.shape[0] // batch_size, + validation_data=(X_test, [Y_test, Y_test]), + epochs=nb_epoch, verbose=2, + callbacks=[lr_reducer, csv_logger, model_checkpoint]) + +scores = model.evaluate(X_test, [Y_test, Y_test], batch_size=batch_size) +for score, metric_name in zip(scores, model.metrics_names): + print("%s : %0.4f" % (metric_name, score)) diff --git a/examples/cifar10_resnet.py b/examples/cifar10_resnet.py new file mode 100644 index 0000000..edb6384 --- /dev/null +++ b/examples/cifar10_resnet.py @@ -0,0 +1,96 @@ +""" +Adapted from keras example cifar10_cnn.py and github.com/raghakot/keras-resnet +Train ResNet-18 on the CIFAR10 small images dataset.
+ +GPU run command with Theano backend (with TensorFlow, the GPU is automatically used): + THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python cifar10_resnet.py +""" +from __future__ import print_function +from keras.datasets import cifar10 +from keras.preprocessing.image import ImageDataGenerator +from keras.utils import np_utils +from keras.callbacks import ModelCheckpoint +from keras.callbacks import ReduceLROnPlateau +from keras.callbacks import CSVLogger +from keras.callbacks import EarlyStopping +from keras_contrib.applications.resnet import ResNet18 + +import numpy as np + + +weights_file = 'ResNet18v2-CIFAR-10.h5' +lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=5, min_lr=0.5e-6) +early_stopper = EarlyStopping(min_delta=0.001, patience=10) +csv_logger = CSVLogger('ResNet18v2-CIFAR-10.csv') +model_checkpoint = ModelCheckpoint(weights_file, monitor='val_acc', save_best_only=True, + save_weights_only=True, mode='auto') + +batch_size = 32 +nb_classes = 10 +nb_epoch = 200 +data_augmentation = True + +# input image dimensions +img_rows, img_cols = 32, 32 +# The CIFAR10 images are RGB. +img_channels = 3 + +# The data, shuffled and split between train and test sets: +(X_train, y_train), (X_test, y_test) = cifar10.load_data() + +# Convert class vectors to binary class matrices. +Y_train = np_utils.to_categorical(y_train, nb_classes) +Y_test = np_utils.to_categorical(y_test, nb_classes) + +X_train = X_train.astype('float32') +X_test = X_test.astype('float32') + +# subtract mean and normalize +mean_image = np.mean(X_train, axis=0) +X_train -= mean_image +X_test -= mean_image +X_train /= 128. +X_test /= 128.
+ +model = ResNet18((img_rows, img_cols, img_channels), nb_classes) +model.compile(loss='categorical_crossentropy', + optimizer='adam', + metrics=['accuracy']) + +if not data_augmentation: + print('Not using data augmentation.') + model.fit(X_train, Y_train, + batch_size=batch_size, + epochs=nb_epoch, + validation_data=(X_test, Y_test), + shuffle=True, + callbacks=[lr_reducer, early_stopper, csv_logger, model_checkpoint]) +else: + print('Using real-time data augmentation.') + # This will do preprocessing and realtime data augmentation: + datagen = ImageDataGenerator( + featurewise_center=False, # set input mean to 0 over the dataset + samplewise_center=False, # set each sample mean to 0 + featurewise_std_normalization=False, # divide inputs by std of the dataset + samplewise_std_normalization=False, # divide each input by its std + zca_whitening=False, # apply ZCA whitening + rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) + width_shift_range=0.1, # randomly shift images horizontally (fraction of total width) + height_shift_range=0.1, # randomly shift images vertically (fraction of total height) + horizontal_flip=True, # randomly flip images + vertical_flip=False) # randomly flip images + + # Compute quantities required for featurewise normalization + # (std, mean, and principal components if ZCA whitening is applied). + datagen.fit(X_train) + + # Fit the model on the batches generated by datagen.flow().
+ model.fit_generator(datagen.flow(X_train, Y_train, batch_size=batch_size), + steps_per_epoch=X_train.shape[0] // batch_size, + validation_data=(X_test, Y_test), + epochs=nb_epoch, verbose=2, + callbacks=[lr_reducer, early_stopper, csv_logger, model_checkpoint]) + +scores = model.evaluate(X_test, Y_test, batch_size=batch_size) +print('Test loss : ', scores[0]) +print('Test accuracy : ', scores[1]) diff --git a/keras_contrib/applications/__init__.py b/keras_contrib/applications/__init__.py index e9d829d..a1592a7 100644 --- a/keras_contrib/applications/__init__.py +++ b/keras_contrib/applications/__init__.py @@ -1,2 +1,5 @@ from .densenet import DenseNet from .ror import ResidualOfResidual +from .resnet import ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 +from .wide_resnet import WideResidualNetwork +from .nasnet import NASNet, NASNetLarge, NASNetMobile diff --git a/keras_contrib/applications/densenet.py b/keras_contrib/applications/densenet.py index e0556fc..4bb0337 100644 --- a/keras_contrib/applications/densenet.py +++ b/keras_contrib/applications/densenet.py @@ -506,7 +506,11 @@ def DenseNetImageNet161(input_shape=None, pooling=pooling, classes=classes, activation=activation) -def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1e-4): +def name_or_none(prefix, name): + return prefix + name if (prefix is not None and name is not None) else None + + +def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_decay=1e-4, block_prefix=None): ''' Adds a convolution layer (with batch normalization and relu), and optionally a bottleneck layer. 
@@ -518,6 +522,7 @@ def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_deca bottleneck: if True, adds a bottleneck convolution block dropout_rate: dropout rate weight_decay: weight decay factor + block_prefix: str, for unique layer naming # Input shape 4D tensor with shape: @@ -538,18 +543,20 @@ def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_deca with K.name_scope('ConvBlock'): concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(ip) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name=name_or_none(block_prefix, '_bn'))(ip) x = Activation('relu')(x) if bottleneck: inter_channel = nb_filter * 4 x = Conv2D(inter_channel, (1, 1), kernel_initializer='he_normal', padding='same', use_bias=False, - kernel_regularizer=l2(weight_decay))(x) - x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x) + kernel_regularizer=l2(weight_decay), name=name_or_none(block_prefix, '_bottleneck_conv2D'))(x) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, + name=name_or_none(block_prefix, '_bottleneck_bn'))(x) x = Activation('relu')(x) - x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_normal', padding='same', use_bias=False)(x) + x = Conv2D(nb_filter, (3, 3), kernel_initializer='he_normal', padding='same', use_bias=False, + name=name_or_none(block_prefix, '_conv2D'))(x) if dropout_rate: x = Dropout(dropout_rate)(x) @@ -557,7 +564,7 @@ def __conv_block(ip, nb_filter, bottleneck=False, dropout_rate=None, weight_deca def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropout_rate=None, - weight_decay=1e-4, grow_nb_filters=True, return_concat_list=False): + weight_decay=1e-4, grow_nb_filters=True, return_concat_list=False, block_prefix=None): ''' Build a dense_block where the output of each conv_block is fed to subsequent ones @@ -575,6 +582,7 @@ def __dense_block(x, nb_layers, nb_filter, growth_rate, 
bottleneck=False, dropou grow_nb_filters: if True, allows number of filters to grow return_concat_list: set to True to return the list of feature maps along with the actual output + block_prefix: str, for block unique naming # Return If return_concat_list is True, returns a list of the output @@ -590,7 +598,8 @@ def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropou x_list = [x] for i in range(nb_layers): - cb = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay) + cb = __conv_block(x, growth_rate, bottleneck, dropout_rate, weight_decay, + block_prefix=name_or_none(block_prefix, '_%i' % i)) x_list.append(cb) x = concatenate([x, cb], axis=concat_axis) @@ -604,7 +613,7 @@ def __dense_block(x, nb_layers, nb_filter, growth_rate, bottleneck=False, dropou return x, nb_filter -def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4): +def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4, block_prefix=None): ''' Adds a pointwise convolution layer (with batch normalization and relu), and an average pooling layer. The number of output convolution filters @@ -617,6 +626,7 @@ def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4): compression: calculated as 1 - reduction. Reduces the number of feature maps in the transition block. 
weight_decay: weight decay factor + block_prefix: str, for block unique naming # Input shape 4D tensor with shape: @@ -638,16 +648,16 @@ def __transition_block(ip, nb_filter, compression=1.0, weight_decay=1e-4): with K.name_scope('Transition'): concat_axis = 1 if K.image_data_format() == 'channels_first' else -1 - x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(ip) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name=name_or_none(block_prefix, '_bn'))(ip) x = Activation('relu')(x) x = Conv2D(int(nb_filter * compression), (1, 1), kernel_initializer='he_normal', padding='same', - use_bias=False, kernel_regularizer=l2(weight_decay))(x) + use_bias=False, kernel_regularizer=l2(weight_decay), name=name_or_none(block_prefix, '_conv2D'))(x) x = AveragePooling2D((2, 2), strides=(2, 2))(x) return x -def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4): +def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4, block_prefix=None): '''Adds an upsampling block. Upsampling operation relies on the the type parameter. # Arguments @@ -657,6 +667,7 @@ def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4): type: can be 'upsampling', 'subpixel', 'deconv'. 
Determines type of upsampling performed weight_decay: weight decay factor + block_prefix: str, for block unique naming # Input shape 4D tensor with shape: @@ -676,17 +687,17 @@ def __transition_up_block(ip, nb_filters, type='deconv', weight_decay=1E-4): with K.name_scope('TransitionUp'): if type == 'upsampling': - x = UpSampling2D()(ip) + x = UpSampling2D(name=name_or_none(block_prefix, '_upsampling'))(ip) elif type == 'subpixel': x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay), - use_bias=False, kernel_initializer='he_normal')(ip) - x = SubPixelUpscaling(scale_factor=2)(x) + use_bias=False, kernel_initializer='he_normal', name=name_or_none(block_prefix, '_conv2D'))(ip) + x = SubPixelUpscaling(scale_factor=2, name=name_or_none(block_prefix, '_subpixel'))(x) x = Conv2D(nb_filters, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(weight_decay), - use_bias=False, kernel_initializer='he_normal')(x) + use_bias=False, kernel_initializer='he_normal', name=name_or_none(block_prefix, '_conv2D_2'))(x) else: x = Conv2DTranspose(nb_filters, (3, 3), activation='relu', padding='same', strides=(2, 2), - kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(ip) - + kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay), + name=name_or_none(block_prefix, '_conv2DT'))(ip) return x @@ -781,27 +792,30 @@ def __create_dense_net(nb_classes, img_input, include_top, depth=40, nb_dense_bl initial_kernel = (3, 3) initial_strides = (1, 1) - x = Conv2D(nb_filter, initial_kernel, kernel_initializer='he_normal', padding='same', + x = Conv2D(nb_filter, initial_kernel, kernel_initializer='he_normal', padding='same', name='initial_conv2D', strides=initial_strides, use_bias=False, kernel_regularizer=l2(weight_decay))(img_input) if subsample_initial_block: - x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='initial_bn')(x) x =
Activation('relu')(x) x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) # Add dense blocks for block_idx in range(nb_dense_block - 1): x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, bottleneck=bottleneck, - dropout_rate=dropout_rate, weight_decay=weight_decay) + dropout_rate=dropout_rate, weight_decay=weight_decay, + block_prefix='dense_%i' % block_idx) # add transition_block - x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay) + x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay, + block_prefix='tr_%i' % block_idx) nb_filter = int(nb_filter * compression) # The last dense_block does not have a transition_block x, nb_filter = __dense_block(x, final_nb_layer, nb_filter, growth_rate, bottleneck=bottleneck, - dropout_rate=dropout_rate, weight_decay=weight_decay) + dropout_rate=dropout_rate, weight_decay=weight_decay, + block_prefix='dense_%i' % (nb_dense_block - 1)) - x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='final_bn')(x) x = Activation('relu')(x) if include_top: @@ -889,7 +903,7 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, # Initial convolution x = Conv2D(init_conv_filters, (7, 7), kernel_initializer='he_normal', padding='same', name='initial_conv2D', use_bias=False, kernel_regularizer=l2(weight_decay))(img_input) - x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5)(x) + x = BatchNormalization(axis=concat_axis, epsilon=1.1e-5, name='initial_bn')(x) x = Activation('relu')(x) nb_filter = init_conv_filters @@ -899,13 +913,14 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, # Add dense blocks and transition down block for block_idx in range(nb_dense_block): x, nb_filter = __dense_block(x, nb_layers[block_idx], nb_filter, growth_rate, dropout_rate=dropout_rate, - weight_decay=weight_decay) + 
weight_decay=weight_decay, block_prefix='dense_%i' % block_idx) # Skip connection skip_list.append(x) # add transition_block - x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay) + x = __transition_block(x, nb_filter, compression=compression, weight_decay=weight_decay, + block_prefix='tr_%i' % block_idx) nb_filter = int(nb_filter * compression) # this is calculated inside transition_down_block @@ -913,7 +928,8 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, # return the concatenated feature maps without the concatenation of the input _, nb_filter, concat_list = __dense_block(x, bottleneck_nb_layers, nb_filter, growth_rate, dropout_rate=dropout_rate, weight_decay=weight_decay, - return_concat_list=True) + return_concat_list=True, + block_prefix='dense_%i' % nb_dense_block) skip_list = skip_list[::-1] # reverse the skip list @@ -925,16 +941,18 @@ def __create_fcn_dense_net(nb_classes, img_input, include_top, nb_dense_block=5, # not the concatenation of the input with the feature maps (concat_list[0]. 
l = concatenate(concat_list[1:], axis=concat_axis) - t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type, weight_decay=weight_decay) + t = __transition_up_block(l, nb_filters=n_filters_keep, type=upsampling_type, weight_decay=weight_decay, + block_prefix='tr_up_%i' % block_idx) # concatenate the skip connection with the transition block x = concatenate([t, skip_list[block_idx]], axis=concat_axis) # Dont allow the feature map size to grow in upsampling dense blocks - x_up, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1], nb_filter=growth_rate, - growth_rate=growth_rate, dropout_rate=dropout_rate, - weight_decay=weight_decay, return_concat_list=True, - grow_nb_filters=False) + x_up, nb_filter, concat_list = __dense_block(x, nb_layers[nb_dense_block + block_idx + 1], + nb_filter=growth_rate, growth_rate=growth_rate, + dropout_rate=dropout_rate, weight_decay=weight_decay, + return_concat_list=True, grow_nb_filters=False, + block_prefix='dense_%i' % (nb_dense_block + 1 + block_idx)) if include_top: x = Conv2D(nb_classes, (1, 1), activation='linear', padding='same', use_bias=False)(x_up) diff --git a/keras_contrib/applications/nasnet.py b/keras_contrib/applications/nasnet.py new file mode 100644 index 0000000..89ebb98 --- /dev/null +++ b/keras_contrib/applications/nasnet.py @@ -0,0 +1,773 @@ +"""Collection of NASNet models + +The reference paper: + - [Learning Transferable Architectures for Scalable Image Recognition] + (https://arxiv.org/abs/1707.07012) + +The reference implementation: +1. TF Slim + - https://github.com/tensorflow/models/blob/master/research/slim/nets/ + nasnet/nasnet.py +2. TensorNets + - https://github.com/taehoonlee/tensornets/blob/master/tensornets/nasnets.py +3. 
Weights + - https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet +""" +from __future__ import print_function +from __future__ import absolute_import +from __future__ import division + +import warnings + +from keras.models import Model +from keras.layers import Input +from keras.layers import Activation +from keras.layers import Dense +from keras.layers import Dropout +from keras.layers import BatchNormalization +from keras.layers import MaxPooling2D +from keras.layers import AveragePooling2D +from keras.layers import GlobalAveragePooling2D +from keras.layers import GlobalMaxPooling2D +from keras.layers import Conv2D +from keras.layers import SeparableConv2D +from keras.layers import ZeroPadding2D +from keras.layers import Cropping2D +from keras.layers import concatenate +from keras.layers import add +from keras.regularizers import l2 +from keras.utils.data_utils import get_file +from keras.engine.topology import get_source_inputs +from keras.applications.imagenet_utils import _obtain_input_shape +from keras.applications.inception_v3 import preprocess_input +from keras.applications.imagenet_utils import decode_predictions +from keras import backend as K + +_BN_DECAY = 0.9997 +_BN_EPSILON = 1e-3 + +NASNET_MOBILE_WEIGHT_PATH = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-mobile.h5" +NASNET_MOBILE_WEIGHT_PATH_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-mobile-no-top.h5" +NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-auxiliary-mobile.h5" +NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.0/NASNet-auxiliary-mobile-no-top.h5" +NASNET_LARGE_WEIGHT_PATH = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-large.h5" +NASNET_LARGE_WEIGHT_PATH_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-large-no-top.h5" 
+NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-auxiliary-large.h5" +NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP = "https://github.com/titu1994/Keras-NASNet/releases/download/v1.1/NASNet-auxiliary-large-no-top.h5" + + +def NASNet(input_shape=None, + penultimate_filters=4032, + nb_blocks=6, + stem_filters=96, + skip_reduction=True, + use_auxiliary_branch=False, + filters_multiplier=2, + dropout=0.5, + weight_decay=5e-5, + include_top=True, + weights=None, + input_tensor=None, + pooling=None, + classes=1000, + default_size=None): + """Instantiates a NASNet architecture. + Note that only TensorFlow is supported for now, + therefore it only works with the data format + `image_data_format='channels_last'` in your Keras config + at `~/.keras/keras.json`. + + # Arguments + input_shape: optional shape tuple, only to be specified + if `include_top` is False (otherwise the input shape + has to be `(331, 331, 3)` for NASNetLarge or + `(224, 224, 3)` for NASNetMobile + It should have exactly 3 inputs channels, + and width and height should be no smaller than 32. + E.g. `(224, 224, 3)` would be one valid value. + penultimate_filters: number of filters in the penultimate layer. + NASNet models use the notation `NASNet (N @ P)`, where: + - N is the number of blocks + - P is the number of penultimate filters + nb_blocks: number of repeated blocks of the NASNet model. + NASNet models use the notation `NASNet (N @ P)`, where: + - N is the number of blocks + - P is the number of penultimate filters + stem_filters: number of filters in the initial stem block + skip_reduction: Whether to skip the reduction step at the tail + end of the network. Set to `False` for CIFAR models. + use_auxiliary_branch: Whether to use the auxiliary branch during + training or evaluation. + filters_multiplier: controls the width of the network. 
+ - If `filters_multiplier` < 1.0, proportionally decreases the number + of filters in each layer. + - If `filters_multiplier` > 1.0, proportionally increases the number + of filters in each layer. + - If `filters_multiplier` = 1, default number of filters from the paper + are used at each layer. + dropout: dropout rate + weight_decay: l2 regularization weight + include_top: whether to include the fully-connected + layer at the top of the network. + weights: `None` (random initialization) or + `imagenet` (ImageNet weights) + input_tensor: optional Keras tensor (i.e. output of + `layers.Input()`) + to use as image input for the model. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model + will be the 4D tensor output of the + last convolutional layer. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional layer, and thus + the output of the model will be a + 2D tensor. + - `max` means that global max pooling will + be applied. + classes: optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. + default_size: specifies the default image size of the model + # Returns + A Keras model instance. + # Raises + ValueError: in case of invalid argument for `weights`, + or invalid input shape. + RuntimeError: If attempting to run this model with a + backend that does not support separable convolutions. 
+ """ + if K.backend() != 'tensorflow': + raise RuntimeError('Only Tensorflow backend is currently supported, ' + 'as other backends do not support ' + 'separable convolution.') + + if weights not in {'imagenet', None}: + raise ValueError('The `weights` argument should be either ' + '`None` (random initialization) or `imagenet` ' + '(pre-training on ImageNet).') + + if weights == 'imagenet' and include_top and classes != 1000: + raise ValueError('If using `weights` as ImageNet with `include_top` ' + 'as true, `classes` should be 1000') + + if default_size is None: + default_size = 331 + + # Determine proper input shape and default size. + input_shape = _obtain_input_shape(input_shape, + default_size=default_size, + min_size=32, + data_format=K.image_data_format(), + require_flatten=include_top or weights) + + if K.image_data_format() != 'channels_last': + warnings.warn('The NASNet family of models is only available ' + 'for the input data format "channels_last" ' + '(width, height, channels). ' + 'However your settings specify the default ' + 'data format "channels_first" (channels, width, height).' + ' You should set `image_data_format="channels_last"` ' + 'in your Keras config located at ~/.keras/keras.json. ' + 'The model being returned right now will expect inputs ' + 'to follow the "channels_last" data format.') + K.set_image_data_format('channels_last') + old_data_format = 'channels_first' + else: + old_data_format = None + + if input_tensor is None: + img_input = Input(shape=input_shape) + else: + if not K.is_keras_tensor(input_tensor): + img_input = Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + assert penultimate_filters % 24 == 0, "`penultimate_filters` needs to be divisible " \ + "by 24." 
+ + channel_dim = 1 if K.image_data_format() == 'channels_first' else -1 + filters = penultimate_filters // 24 + + if not skip_reduction: + x = Conv2D(stem_filters, (3, 3), strides=(2, 2), padding='valid', use_bias=False, name='stem_conv1', + kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(img_input) + else: + x = Conv2D(stem_filters, (3, 3), strides=(1, 1), padding='same', use_bias=False, name='stem_conv1', + kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(img_input) + + x = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON, + name='stem_bn1')(x) + + p = None + if not skip_reduction: # imagenet / mobile mode + x, p = _reduction_A(x, p, filters // (filters_multiplier ** 2), weight_decay, id='stem_1') + x, p = _reduction_A(x, p, filters // filters_multiplier, weight_decay, id='stem_2') + + for i in range(nb_blocks): + x, p = _normal_A(x, p, filters, weight_decay, id='%d' % (i)) + + x, p0 = _reduction_A(x, p, filters * filters_multiplier, weight_decay, id='reduce_%d' % (nb_blocks)) + + p = p0 if not skip_reduction else p + + for i in range(nb_blocks): + x, p = _normal_A(x, p, filters * filters_multiplier, weight_decay, id='%d' % (nb_blocks + i + 1)) + + auxiliary_x = None + if not skip_reduction: # imagenet / mobile mode + if use_auxiliary_branch: + auxiliary_x = _add_auxiliary_head(x, classes, weight_decay) + + x, p0 = _reduction_A(x, p, filters * filters_multiplier ** 2, weight_decay, id='reduce_%d' % (2 * nb_blocks)) + + if skip_reduction: # CIFAR mode + if use_auxiliary_branch: + auxiliary_x = _add_auxiliary_head(x, classes, weight_decay) + + p = p0 if not skip_reduction else p + + for i in range(nb_blocks): + x, p = _normal_A(x, p, filters * filters_multiplier ** 2, weight_decay, id='%d' % (2 * nb_blocks + i + 1)) + + x = Activation('relu')(x) + + if include_top: + x = GlobalAveragePooling2D()(x) + x = Dropout(dropout)(x) + x = Dense(classes, activation='softmax', 
kernel_regularizer=l2(weight_decay), name='predictions')(x) + else: + if pooling == 'avg': + x = GlobalAveragePooling2D()(x) + elif pooling == 'max': + x = GlobalMaxPooling2D()(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = get_source_inputs(input_tensor) + else: + inputs = img_input + + # Create model. + if use_auxiliary_branch: + model = Model(inputs, [x, auxiliary_x], name='NASNet_with_auxiliary') + else: + model = Model(inputs, x, name='NASNet') + + # load weights + if weights == 'imagenet': + if default_size == 224: # mobile version + if include_top: + if use_auxiliary_branch: + weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY + model_name = 'nasnet_mobile_with_aux.h5' + else: + weight_path = NASNET_MOBILE_WEIGHT_PATH + model_name = 'nasnet_mobile.h5' + else: + if use_auxiliary_branch: + weight_path = NASNET_MOBILE_WEIGHT_PATH_WITH_AUXULARY_NO_TOP + model_name = 'nasnet_mobile_with_aux_no_top.h5' + else: + weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP + model_name = 'nasnet_mobile_no_top.h5' + + weights_file = get_file(model_name, weight_path, cache_subdir='models') + model.load_weights(weights_file, by_name=True) + + elif default_size == 331: # large version + if include_top: + if use_auxiliary_branch: + weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary + model_name = 'nasnet_large_with_aux.h5' + else: + weight_path = NASNET_LARGE_WEIGHT_PATH + model_name = 'nasnet_large.h5' + else: + if use_auxiliary_branch: + weight_path = NASNET_LARGE_WEIGHT_PATH_WITH_auxiliary_NO_TOP + model_name = 'nasnet_large_with_aux_no_top.h5' + else: + weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP + model_name = 'nasnet_large_no_top.h5' + + weights_file = get_file(model_name, weight_path, cache_subdir='models') + model.load_weights(weights_file, by_name=True) + + else: + raise ValueError('ImageNet weights can only be loaded on NASNetLarge or NASNetMobile') + + if old_data_format: 
def NASNetLarge(input_shape=(331, 331, 3),
                dropout=0.5,
                weight_decay=5e-5,
                use_auxiliary_branch=False,
                include_top=True,
                weights='imagenet',
                input_tensor=None,
                pooling=None,
                classes=1000):
    """Builds the large ImageNet variant of NASNet.

    Sets the module-level batch normalization constants
    (`_BN_DECAY = 0.9997`, `_BN_EPSILON = 1e-3`) and delegates to `NASNet`
    with 4032 penultimate filters, 6 blocks per stage, 96 stem filters
    and a default image size of 331.

    # Arguments
        input_shape: optional shape tuple, forwarded to `NASNet`.
            Defaults to `(331, 331, 3)`.
        dropout: dropout rate applied before the classification layer.
        weight_decay: l2 regularization weight.
        use_auxiliary_branch: whether to add the auxiliary classifier;
            when True the returned model has two outputs.
        include_top: whether to include the fully-connected
            layer at the top of the network.
        weights: `None` (random initialization) or
            `imagenet` (ImageNet weights).
        input_tensor: optional Keras tensor (i.e. output of
            `layers.Input()`) to use as image input for the model.
        pooling: optional pooling mode used when `include_top`
            is `False`: `None` (no pooling), `avg` (global average
            pooling) or `max` (global max pooling).
        classes: number of classes of the classification layer;
            must be 1000 when `include_top` is True and
            `weights='imagenet'`.

    # Returns
        A Keras model instance.

    # Raises
        ValueError: in case of invalid argument for `weights`
            or an incompatible `classes` value.
        RuntimeError: if the backend is not TensorFlow.
    """
    global _BN_DECAY, _BN_EPSILON
    _BN_DECAY, _BN_EPSILON = 0.9997, 1e-3

    # Fixed architecture hyper-parameters of the "large" variant.
    settings = dict(penultimate_filters=4032,
                    nb_blocks=6,
                    stem_filters=96,
                    skip_reduction=False,
                    filters_multiplier=2,
                    default_size=331)
    # Caller-controlled options, forwarded unchanged.
    settings.update(use_auxiliary_branch=use_auxiliary_branch,
                    dropout=dropout,
                    weight_decay=weight_decay,
                    include_top=include_top,
                    weights=weights,
                    input_tensor=input_tensor,
                    pooling=pooling,
                    classes=classes)
    return NASNet(input_shape, **settings)
def NASNetCIFAR(input_shape=(32, 32, 3),
                dropout=0.0,
                weight_decay=5e-4,
                use_auxiliary_branch=False,
                include_top=True,
                weights=None,
                input_tensor=None,
                pooling=None,
                classes=10):
    """Instantiates a NASNet architecture in CIFAR mode.

    Sets the module-level batch normalization constants
    (`_BN_DECAY = 0.9`, `_BN_EPSILON = 1e-5`) and delegates to `NASNet`
    with 768 penultimate filters, 6 blocks per stage, 32 stem filters
    and `skip_reduction=True`.

    Note that only TensorFlow is supported for now,
    therefore it only works with the data format
    `image_data_format='channels_last'` in your Keras config
    at `~/.keras/keras.json`.

    # Arguments
        input_shape: optional shape tuple, forwarded to `NASNet`.
            Defaults to `(32, 32, 3)` for NASNetCIFAR.
        dropout: dropout rate
        weight_decay: l2 regularization weight
        use_auxiliary_branch: Whether to use the auxiliary branch during
            training or evaluation.
        include_top: whether to include the fully-connected
            layer at the top of the network.
        weights: `None` (random initialization). ImageNet weights are
            only published for NASNetLarge and NASNetMobile, so
            `'imagenet'` is rejected with a `ValueError`.
        input_tensor: optional Keras tensor (i.e. output of
            `layers.Input()`)
            to use as image input for the model.
        pooling: Optional pooling mode for feature extraction
            when `include_top` is `False`.
            - `None` means that the output of the model
                will be the 4D tensor output of the
                last convolutional layer.
            - `avg` means that global average pooling
                will be applied to the output of the
                last convolutional layer, and thus
                the output of the model will be a
                2D tensor.
            - `max` means that global max pooling will
                be applied.
        classes: optional number of classes to classify images
            into, only to be specified if `include_top` is True.

    # Returns
        A Keras model instance.

    # Raises
        ValueError: in case of invalid argument for `weights`,
            or invalid input shape.
        RuntimeError: If attempting to run this model with a
            backend that does not support separable convolutions.
    """
    global _BN_DECAY, _BN_EPSILON
    _BN_DECAY = 0.9
    _BN_EPSILON = 1e-5

    # default_size must be the CIFAR resolution (32), not 224: NASNet keys
    # its ImageNet weight loading on default_size == 224 (mobile) / 331
    # (large), so 224 here would route a CIFAR model into the NASNetMobile
    # weight files and make `input_shape=None` default to 224x224.
    return NASNet(input_shape,
                  penultimate_filters=768,
                  nb_blocks=6,
                  stem_filters=32,
                  skip_reduction=True,
                  use_auxiliary_branch=use_auxiliary_branch,
                  filters_multiplier=2,
                  dropout=dropout,
                  weight_decay=weight_decay,
                  include_top=include_top,
                  weights=weights,
                  input_tensor=input_tensor,
                  pooling=pooling,
                  classes=classes,
                  default_size=32)
def _normal_A(ip, p, filters, weight_decay=5e-5, id=None):
    '''Adds a Normal cell for NASNet-A (Fig. 4 in the paper)

    The cell combines the current input `ip` with the previous cell's
    input `p` through five two-branch blocks and concatenates `p` with
    the five block outputs along the channel axis.

    # Arguments:
        ip: input tensor `x`
        p: input tensor `p`; passed through `_adjust_block` before use
        filters: number of output filters
        weight_decay: l2 regularization weight
        id: string id, appended to every layer name created here

    # Returns:
        a tuple `(x, ip)`: the cell output tensor and the unmodified
        input `ip`, which callers feed back in as the next cell's `p`
    '''
    channel_dim = 1 if K.image_data_format() == 'channels_first' else -1

    with K.name_scope('normal_A_block_%s' % id):
        # Bring `p` to a shape compatible with `ip` / `filters`.
        p = _adjust_block(p, ip, filters, weight_decay, id)

        # relu -> 1x1 conv -> batch norm projection of the current input.
        h = Activation('relu')(ip)
        h = Conv2D(filters, (1, 1), strides=(1, 1), padding='same', name='normal_conv_1_%s' % id,
                   use_bias=False, kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(h)
        h = BatchNormalization(axis=channel_dim, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
                               name='normal_bn_1_%s' % id)(h)

        # Block 1: 5x5 separable conv on h + 3x3 separable conv on p.
        with K.name_scope('block_1'):
            x1_1 = _separable_conv_block(h, filters, kernel_size=(5, 5), weight_decay=weight_decay,
                                         id='normal_left1_%s' % id)
            x1_2 = _separable_conv_block(p, filters, weight_decay=weight_decay, id='normal_right1_%s' % id)
            x1 = add([x1_1, x1_2], name='normal_add_1_%s' % id)

        # Block 2: 5x5 and 3x3 separable convs, both on p.
        with K.name_scope('block_2'):
            x2_1 = _separable_conv_block(p, filters, (5, 5), weight_decay=weight_decay, id='normal_left2_%s' % id)
            x2_2 = _separable_conv_block(p, filters, (3, 3), weight_decay=weight_decay, id='normal_right2_%s' % id)
            x2 = add([x2_1, x2_2], name='normal_add_2_%s' % id)

        # Block 3: 3x3 average pooling of h + identity of p.
        with K.name_scope('block_3'):
            x3 = AveragePooling2D((3, 3), strides=(1, 1), padding='same', name='normal_left3_%s' % (id))(h)
            x3 = add([x3, p], name='normal_add_3_%s' % id)

        # Block 4: two 3x3 average poolings, both of p.
        with K.name_scope('block_4'):
            x4_1 = AveragePooling2D((3, 3), strides=(1, 1), padding='same', name='normal_left4_%s' % (id))(p)
            x4_2 = AveragePooling2D((3, 3), strides=(1, 1), padding='same', name='normal_right4_%s' % (id))(p)
            x4 = add([x4_1, x4_2], name='normal_add_4_%s' % id)

        # Block 5: 3x3 separable conv on h + identity of h.
        with K.name_scope('block_5'):
            x5 = _separable_conv_block(h, filters, weight_decay=weight_decay, id='normal_left5_%s' % id)
            x5 = add([x5, h], name='normal_add_5_%s' % id)

        x = concatenate([p, x1, x2, x3, x4, x5], axis=channel_dim, name='normal_concat_%s' % id)
    return x, ip
def _add_auxiliary_head(x, classes, weight_decay):
    '''Adds an auxiliary head for training the model

    From section A.7 "Training of ImageNet models" of the paper, all NASNet models are
    trained using an auxiliary classifier around 2/3 of the depth of the network, with
    a loss weight of 0.4

    The head is: relu -> 5x5 average pooling (stride 3) -> 1x1 conv (128 filters)
    -> batch norm -> relu -> full-extent conv (768 filters) -> batch norm -> relu
    -> global average pooling -> softmax `Dense` layer.

    # Arguments
        x: input tensor
        classes: number of output classes
        weight_decay: l2 regularization weight

    # Returns
        a keras Tensor
    '''
    # Axis indices of height / width / channels for the active data format.
    img_height = 1 if K.image_data_format() == 'channels_last' else 2
    img_width = 2 if K.image_data_format() == 'channels_last' else 3
    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1

    with K.name_scope('auxiliary_branch'):
        auxiliary_x = Activation('relu')(x)
        auxiliary_x = AveragePooling2D((5, 5), strides=(3, 3), padding='valid', name='aux_pool')(auxiliary_x)
        auxiliary_x = Conv2D(128, (1, 1), padding='same', use_bias=False, name='aux_conv_projection',
                             kernel_initializer='he_normal', kernel_regularizer=l2(weight_decay))(auxiliary_x)
        auxiliary_x = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
                                         name='aux_bn_projection')(auxiliary_x)
        auxiliary_x = Activation('relu')(auxiliary_x)

        # The kernel size is read from the tensor's static shape so that this
        # 'valid' convolution covers the whole remaining feature map.
        auxiliary_x = Conv2D(768, (auxiliary_x._keras_shape[img_height], auxiliary_x._keras_shape[img_width]),
                             padding='valid', use_bias=False, kernel_initializer='he_normal',
                             kernel_regularizer=l2(weight_decay), name='aux_conv_reduction')(auxiliary_x)
        auxiliary_x = BatchNormalization(axis=channel_axis, momentum=_BN_DECAY, epsilon=_BN_EPSILON,
                                         name='aux_bn_reduction')(auxiliary_x)
        auxiliary_x = Activation('relu')(auxiliary_x)

        auxiliary_x = GlobalAveragePooling2D()(auxiliary_x)
        auxiliary_x = Dense(classes, activation='softmax', kernel_regularizer=l2(weight_decay),
                            name='aux_predictions')(auxiliary_x)
    return auxiliary_x
def _bn_relu(x, bn_name=None, relu_name=None):
    """Applies batch normalization followed by a relu activation to `x`.

    Normalization runs over the module-level `CHANNEL_AXIS`; `bn_name` and
    `relu_name` are the optional names given to the two layers.
    """
    batch_norm = BatchNormalization(axis=CHANNEL_AXIS, name=bn_name)
    relu = Activation("relu", name=relu_name)
    return relu(batch_norm(x))
def _bn_relu_conv(**conv_params):
    """Returns a function applying BN -> relu -> conv (full pre-activation unit).

    This is the ResNet v2 scheme proposed in http://arxiv.org/pdf/1603.05027v2.pdf

    `filters` and `kernel_size` are required keyword arguments. Optional ones
    are `strides`, `dilation_rate`, `conv_name`, `bn_name`, `relu_name`,
    `kernel_initializer`, `padding` and `kernel_regularizer`; any other
    keyword is ignored.
    """
    options = {
        "strides": (1, 1),
        "dilation_rate": (1, 1),
        "conv_name": None,
        "bn_name": None,
        "relu_name": None,
        "kernel_initializer": "he_normal",
        "padding": "same",
        "kernel_regularizer": l2(1.e-4),
    }
    options.update(conv_params)

    # Indexing (not .get) so a missing required key raises KeyError.
    conv_kwargs = {
        "filters": options["filters"],
        "kernel_size": options["kernel_size"],
        "strides": options["strides"],
        "padding": options["padding"],
        "dilation_rate": options["dilation_rate"],
        "kernel_initializer": options["kernel_initializer"],
        "kernel_regularizer": options["kernel_regularizer"],
        "name": options["conv_name"],
    }

    def f(x):
        pre_activated = _bn_relu(x, bn_name=options["bn_name"],
                                 relu_name=options["relu_name"])
        # A new Conv2D layer is created on every call of `f`.
        return Conv2D(**conv_kwargs)(pre_activated)

    return f
+ if stride_width > 1 or stride_height > 1 or not equal_channels: + print('reshaping via a convolution...') + if conv_name_base is not None: + conv_name_base = conv_name_base + '1' + shortcut = Conv2D(filters=residual_shape[CHANNEL_AXIS], + kernel_size=(1, 1), + strides=(stride_width, stride_height), + padding="valid", + kernel_initializer="he_normal", + kernel_regularizer=l2(0.0001), + name=conv_name_base)(input_feature) + if bn_name_base is not None: + bn_name_base = bn_name_base + '1' + shortcut = BatchNormalization(axis=CHANNEL_AXIS, name=bn_name_base)(shortcut) + + return add([shortcut, residual]) + + +def _residual_block(block_function, filters, blocks, stage, + transition_strides=None, transition_dilation_rates=None, + dilation_rates=(1, 1), is_first_layer=False, dropout=None, + residual_unit=_bn_relu_conv): + """Builds a residual block with repeating bottleneck blocks. + + stage: integer, current stage label, used for generating layer names + blocks: number of blocks 'a','b'..., current block label, used for generating layer names + transition_strides: a list of tuples for the strides of each transition + transition_dilation_rates: a list of tuples for the dilation rate of each transition + """ + if transition_dilation_rates is None: + transition_dilation_rates = [(1, 1)] * blocks + if transition_strides is None: + transition_strides = [(1, 1)] * blocks + + def f(x): + for i in range(blocks): + x = block_function(filters=filters, stage=stage, block=i, + transition_strides=transition_strides[i], + dilation_rate=dilation_rates[i], + is_first_block_of_first_layer=(is_first_layer and i == 0), + dropout=dropout, + residual_unit=residual_unit)(x) + return x + + return f + + +def _block_name_base(stage, block): + """Get the convolution name base and batch normalization name base defined by stage and block. + + If there are less than 26 blocks they will be labeled 'a', 'b', 'c' to match the paper and keras + and beyond 26 blocks they will simply be numbered. 
+ """ + if block < 27: + block = '%c' % (block + 97) # 97 is the ascii number for lowercase 'a' + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + return conv_name_base, bn_name_base + + +def basic_block(filters, stage, block, transition_strides=(1, 1), + dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None, + residual_unit=_bn_relu_conv): + """Basic 3 X 3 convolution blocks for use on resnets with layers <= 34. + Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf + """ + def f(input_features): + conv_name_base, bn_name_base = _block_name_base(stage, block) + if is_first_block_of_first_layer: + # don't repeat bn->relu since we just did bn->relu->maxpool + x = Conv2D(filters=filters, kernel_size=(3, 3), + strides=transition_strides, + dilation_rate=dilation_rate, + padding="same", + kernel_initializer="he_normal", + kernel_regularizer=l2(1e-4), + name=conv_name_base + '2a')(input_features) + else: + x = residual_unit(filters=filters, kernel_size=(3, 3), + strides=transition_strides, + dilation_rate=dilation_rate, + conv_name_base=conv_name_base + '2a', + bn_name_base=bn_name_base + '2a')(input_features) + + if dropout is not None: + x = Dropout(dropout)(x) + + x = residual_unit(filters=filters, kernel_size=(3, 3), + conv_name_base=conv_name_base + '2b', + bn_name_base=bn_name_base + '2b')(x) + + return _shortcut(input_features, x) + + return f + + +def bottleneck(filters, stage, block, transition_strides=(1, 1), + dilation_rate=(1, 1), is_first_block_of_first_layer=False, dropout=None, + residual_unit=_bn_relu_conv): + """Bottleneck architecture for > 34 layer resnet. 
def ResNet(input_shape=None, classes=10, block='bottleneck', residual_unit='v2', repetitions=None,
           initial_filters=64, activation='softmax', include_top=True, input_tensor=None, dropout=None,
           transition_dilation_rate=(1, 1), initial_strides=(2, 2), initial_kernel_size=(7, 7),
           initial_pooling='max', final_pooling=None, top='classification'):
    """Builds a custom ResNet like architecture. Defaults to ResNet50 v2.

    Args:
        input_shape: optional shape tuple in the order of the active Keras
            image data format, e.g. `(224, 224, 3)` with `channels_last`
            or `(3, 224, 224)` with `channels_first`.
            Width and height should be no smaller than 8.
        classes: The number of outputs at the final layer.
        block: The block function to use. This is either `'basic'` or `'bottleneck'`,
            the name of another function defined in this module, or a callable.
            The original paper used `basic` for layers < 50.
        residual_unit: the basic residual unit, 'v1' for conv bn relu, 'v2' for bn relu conv,
            the name of another function defined in this module, or a callable.
            See [Identity Mappings in Deep Residual Networks](https://arxiv.org/abs/1603.05027)
            for details.
        repetitions: Number of repetitions of various block units.
            At each block unit, the number of filters are doubled and the input size is halved.
            Default of None implies the ResNet50v2 values of [3, 4, 6, 3].
        initial_filters: number of filters of the first convolution and of the
            first stage; doubled after every stage.
        activation: activation of the final layer, one of `'softmax'`, `'sigmoid'`
            (only with `classes=1`) or None.
        include_top: whether to add the final layers selected by `top`.
        input_tensor: optional tensor passed to `Input(tensor=...)`.
        dropout: None for no dropout, otherwise rate of dropout from 0 to 1.
            Based on the [Wide Residual Networks](https://arxiv.org/pdf/1605.07146) paper.
        transition_dilation_rate: Dilation rate for transition layers. For semantic
            segmentation of images use a dilation rate of (2, 2). The first block of
            each stage is strided by (2, 2) only when this is (1, 1).
        initial_strides: Stride of the very first residual unit and MaxPooling2D call,
            with default (2, 2), set to (1, 1) for small images like cifar.
        initial_kernel_size: kernel size of the very first convolution, (7, 7) for imagenet
            and (3, 3) for small image datasets like tiny imagenet and cifar.
            See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details.
        initial_pooling: Determine if there will be an initial pooling layer,
            'max' for imagenet and None for small image datasets.
            See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details.
        final_pooling: Optional pooling mode for feature extraction at the final model layer,
            applied only when no `top` head was added.
            - `None` means that the output of the model
                will be the 4D tensor output of the
                last convolutional layer.
            - `avg` means that global average pooling
                will be applied to the output of the
                last convolutional layer, and thus
                the output of the model will be a
                2D tensor.
            - `max` means that global max pooling will
                be applied.
        top: Defines final layers to evaluate based on a specific problem type. Options are
            'classification' for ImageNet style problems, 'segmentation' for problems like
            the Pascal VOC dataset, and None to exclude these layers entirely.

    Returns:
        The keras `Model`.

    Raises:
        ValueError: if `activation` is not supported, if `'sigmoid'` is used
            with `classes != 1`, or if a string `block` / `residual_unit`
            cannot be resolved.
        Exception: if the validated input shape does not have 3 dimensions.
    """
    if activation not in ['softmax', 'sigmoid', None]:
        raise ValueError('activation must be one of "softmax", "sigmoid", or None')
    if activation == 'sigmoid' and classes != 1:
        raise ValueError('sigmoid activation can only be used when classes = 1')
    if repetitions is None:
        repetitions = [3, 4, 6, 3]

    # Determine proper input shape. The shape stays in the order of the
    # active data format: re-ordering a channels_first shape to
    # (rows, cols, channels) and validating it again as channels_first
    # rejects valid shapes, and the segmentation head below unpacks
    # channels_first shapes as (channel, row, col).
    input_shape = _obtain_input_shape(input_shape,
                                      default_size=32,
                                      min_size=8,
                                      data_format=K.image_data_format(),
                                      require_flatten=include_top)
    _handle_dim_ordering()
    if len(input_shape) != 3:
        raise Exception("Input shape should be a tuple (nb_channels, nb_rows, nb_cols)")

    # Resolve the block builder; _string_to_function returns non-string
    # identifiers (callables) unchanged.
    if block == 'basic':
        block_fn = basic_block
    elif block == 'bottleneck':
        block_fn = bottleneck
    else:
        block_fn = _string_to_function(block)

    # Resolve the residual unit the same way.
    if residual_unit == 'v2':
        residual_unit = _bn_relu_conv
    elif residual_unit == 'v1':
        residual_unit = _conv_bn_relu
    else:
        residual_unit = _string_to_function(residual_unit)

    img_input = Input(shape=input_shape, tensor=input_tensor)
    x = _conv_bn_relu(filters=initial_filters, kernel_size=initial_kernel_size, strides=initial_strides)(img_input)
    if initial_pooling == 'max':
        x = MaxPooling2D(pool_size=(3, 3), strides=initial_strides, padding="same")(x)

    filters = initial_filters
    for i, r in enumerate(repetitions):
        transition_dilation_rates = [transition_dilation_rate] * r
        transition_strides = [(1, 1)] * r
        if transition_dilation_rate == (1, 1):
            # Without dilation, the first block of the stage downsamples.
            transition_strides[0] = (2, 2)
        x = _residual_block(block_fn, filters=filters,
                            stage=i, blocks=r,
                            is_first_layer=(i == 0),
                            dropout=dropout,
                            transition_dilation_rates=transition_dilation_rates,
                            transition_strides=transition_strides,
                            residual_unit=residual_unit)(x)
        filters *= 2

    # Last activation
    x = _bn_relu(x)

    # Classifier block. Strings are compared with `==`: identity (`is`)
    # only matches when the caller's string happens to be the same object.
    if include_top and top == 'classification':
        x = GlobalAveragePooling2D()(x)
        x = Dense(units=classes, activation=activation, kernel_initializer="he_normal")(x)
    elif include_top and top == 'segmentation':
        x = Conv2D(classes, (1, 1), activation='linear', padding='same')(x)

        if K.image_data_format() == 'channels_first':
            channel, row, col = input_shape
        else:
            row, col, channel = input_shape

        # Flatten the spatial dimensions so the activation is applied per
        # pixel over the classes, then restore the 2D layout.
        x = Reshape((row * col, classes))(x)
        x = Activation(activation)(x)
        x = Reshape((row, col, classes))(x)
    elif final_pooling == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif final_pooling == 'max':
        x = GlobalMaxPooling2D()(x)

    model = Model(inputs=img_input, outputs=x)
    return model
repetitions=[2, 2, 2, 2]) + + +def ResNet34(input_shape, classes): + """ResNet with 34 layers and v2 residual units + """ + return ResNet(input_shape, classes, basic_block, repetitions=[3, 4, 6, 3]) + + +def ResNet50(input_shape, classes): + """ResNet with 50 layers and v2 residual units + """ + return ResNet(input_shape, classes, bottleneck, repetitions=[3, 4, 6, 3]) + + +def ResNet101(input_shape, classes): + """ResNet with 101 layers and v2 residual units + """ + return ResNet(input_shape, classes, bottleneck, repetitions=[3, 4, 23, 3]) + + +def ResNet152(input_shape, classes): + """ResNet with 152 layers and v2 residual units + """ + return ResNet(input_shape, classes, bottleneck, repetitions=[3, 8, 36, 3]) diff --git a/keras_contrib/applications/wide_resnet.py b/keras_contrib/applications/wide_resnet.py index 647d63a..5df99f8 100644 --- a/keras_contrib/applications/wide_resnet.py +++ b/keras_contrib/applications/wide_resnet.py @@ -89,7 +89,7 @@ def WideResidualNetwork(depth=28, width=8, dropout_rate=0.0, default_size=32, min_size=8, data_format=K.image_dim_ordering(), - include_top=include_top) + require_flatten=include_top) if input_tensor is None: img_input = Input(shape=input_shape) diff --git a/keras_contrib/backend/cntk_backend.py b/keras_contrib/backend/cntk_backend.py index 363ad08..624aeee 100644 --- a/keras_contrib/backend/cntk_backend.py +++ b/keras_contrib/backend/cntk_backend.py @@ -1,2 +1,26 @@ from keras.backend import cntk_backend as KCN import cntk as C +import numpy as np + + +def clip(x, min_value, max_value): + """Element-wise value clipping. + + If min_value > max_value, clipping range is [min_value,min_value]. + + # Arguments + x: Tensor or variable. + min_value: Tensor, float, int, or None. + If min_value is None, defaults to -infinity. + max_value: Tensor, float, int, or None. + If max_value is None, defaults to infinity. + + # Returns + A tensor. 
+ """ + if max_value is None: + max_value = np.inf + if min_value is None: + min_value = -np.inf + max_value = C.maximum(min_value, max_value) + return C.clip(x, min_value, max_value) diff --git a/keras_contrib/backend/tensorflow_backend.py b/keras_contrib/backend/tensorflow_backend.py index 7b69687..284cbe4 100644 --- a/keras_contrib/backend/tensorflow_backend.py +++ b/keras_contrib/backend/tensorflow_backend.py @@ -1,28 +1,71 @@ import tensorflow as tf +import numpy as np try: from tensorflow.python.ops import ctc_ops as ctc except ImportError: import tensorflow.contrib.ctc as ctc from keras.backend import tensorflow_backend as KTF -from keras.backend.common import floatx, image_data_format -from keras.backend.tensorflow_backend import _preprocess_conv3d_input -from keras.backend.tensorflow_backend import _postprocess_conv3d_output -from keras.backend.tensorflow_backend import _preprocess_padding -from keras.backend.tensorflow_backend import _preprocess_conv2d_input -from keras.backend.tensorflow_backend import _postprocess_conv2d_output +from keras.backend import dtype +from keras.backend.common import floatx +from keras.backend.common import image_data_format +from keras.backend.tensorflow_backend import _to_tensor py_all = all -def _preprocess_deconv_output_shape(x, shape, data_format): +def _preprocess_conv2d_input(x, data_format): + """Transpose and cast the input before the conv2d. + # Arguments + x: input tensor. + data_format: string, `"channels_last"` or `"channels_first"`. + # Returns + A tensor. + """ + if dtype(x) == 'float64': + x = tf.cast(x, 'float32') if data_format == 'channels_first': - shape = (shape[0],) + tuple(shape[2:]) + (shape[1],) + # TF uses the last dimension as channel dimension, + # instead of the 2nd one. 
+ # TH input shape: (samples, input_depth, rows, cols) + # TF input shape: (samples, rows, cols, input_depth) + x = tf.transpose(x, (0, 2, 3, 1)) + return x - if shape[0] is None: - shape = (tf.shape(x)[0],) + tuple(shape[1:]) - shape = tf.stack(list(shape)) - return shape + +def _postprocess_conv2d_output(x, data_format): + """Transpose and cast the output from conv2d if needed. + # Arguments + x: A tensor. + data_format: string, `"channels_last"` or `"channels_first"`. + # Returns + A tensor. + """ + + if data_format == 'channels_first': + x = tf.transpose(x, (0, 3, 1, 2)) + + if floatx() == 'float64': + x = tf.cast(x, 'float64') + return x + + +def _preprocess_padding(padding): + """Convert keras' padding to tensorflow's padding. + # Arguments + padding: string, `"same"` or `"valid"`. + # Returns + a string, `"SAME"` or `"VALID"`. + # Raises + ValueError: if `padding` is invalid. + """ + if padding == 'same': + padding = 'SAME' + elif padding == 'valid': + padding = 'VALID' + else: + raise ValueError('Invalid padding:', padding) + return padding def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_first', @@ -70,45 +113,6 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_fir return x -def deconv3d(x, kernel, output_shape, strides=(1, 1, 1), - padding='valid', - data_format='default', - image_shape=None, filter_shape=None): - '''3D deconvolution (i.e. transposed convolution). - - # Arguments - x: input tensor. - kernel: kernel tensor. - output_shape: 1D int tensor for the output shape. - strides: strides tuple. - padding: string, "same" or "valid". - data_format: "tf" or "th". - Whether to use Theano or TensorFlow dimension ordering - for inputs/kernels/ouputs. - - # Returns - A tensor, result of transposed 3D convolution. - - # Raises - ValueError: if `data_format` is neither `tf` or `th`. 
- ''' - if data_format == 'default': - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) - - x = _preprocess_conv3d_input(x, data_format) - output_shape = _preprocess_deconv_output_shape(x, output_shape, - data_format) - kernel = tf.transpose(kernel, (0, 1, 2, 4, 3)) - padding = _preprocess_padding(padding) - strides = (1,) + strides + (1,) - - x = tf.nn.conv3d_transpose(x, kernel, output_shape, strides, - padding=padding) - return _postprocess_conv3d_output(x, data_format) - - def extract_image_patches(x, ksizes, ssizes, padding='same', data_format='channels_last'): ''' @@ -158,3 +162,28 @@ def moments(x, axes, shift=None, keep_dims=False): ''' Wrapper over tensorflow backend call ''' return tf.nn.moments(x, axes, shift=shift, keep_dims=keep_dims) + + +def clip(x, min_value, max_value): + """Element-wise value clipping. + + If min_value > max_value, clipping range is [min_value,min_value]. + + # Arguments + x: Tensor or variable. + min_value: Tensor, float, int, or None. + If min_value is None, defaults to -infinity. + max_value: Tensor, float, int, or None. + If max_value is None, defaults to infinity. + + # Returns + A tensor. 
+ """ + if max_value is None: + max_value = np.inf + if min_value is None: + min_value = -np.inf + min_value = _to_tensor(min_value, x.dtype.base_dtype) + max_value = _to_tensor(max_value, x.dtype.base_dtype) + max_value = tf.maximum(min_value, max_value) + return tf.clip_by_value(x, min_value, max_value) diff --git a/keras_contrib/backend/theano_backend.py b/keras_contrib/backend/theano_backend.py index 2b5adaf..78af0ef 100644 --- a/keras_contrib/backend/theano_backend.py +++ b/keras_contrib/backend/theano_backend.py @@ -1,5 +1,6 @@ from theano import tensor as T from theano.sandbox.neighbours import images2neibs +import numpy as np try: import theano.sparse as th_sparse_module @@ -85,56 +86,6 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', data_format='channels_fir return conv_out -def deconv3d(x, kernel, output_shape, strides=(1, 1, 1), - padding='valid', - data_format=None, filter_shape=None): - '''3D deconvolution (transposed convolution). - - # Arguments - kernel: kernel tensor. - output_shape: desired dimensions of output. - strides: strides tuple. - padding: string, "same" or "valid". - data_format: "channels_last" or "channels_first". - Whether to use Theano or TensorFlow dimension ordering - in inputs/kernels/ouputs. - ''' - flip_filters = False - if data_format is None: - data_format = image_data_format() - if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format: ' + str(data_format)) - - if data_format == 'channels_last': - output_shape = (output_shape[0], output_shape[4], output_shape[1], - output_shape[2], output_shape[3]) - - x = _preprocess_conv3d_input(x, data_format) - kernel = _preprocess_conv3d_kernel(kernel, data_format) - kernel = kernel.dimshuffle((1, 0, 2, 3, 4)) - th_padding = _preprocess_padding(padding) - - if hasattr(kernel, '_keras_shape'): - kernel_shape = kernel._keras_shape - else: - # Will only work if `kernel` is a shared variable. 
- kernel_shape = kernel.eval().shape - - filter_shape = _preprocess_conv3d_filter_shape(filter_shape, data_format) - filter_shape = tuple(filter_shape[i] for i in (1, 0, 2, 3, 4)) - - conv_out = T.nnet.abstract_conv.conv3d_grad_wrt_inputs( - x, kernel, output_shape, - filter_shape=filter_shape, - border_mode=th_padding, - subsample=strides, - filter_flip=not flip_filters) - - conv_out = _postprocess_conv3d_output(conv_out, x, padding, - kernel_shape, strides, data_format) - return conv_out - - def extract_image_patches(X, ksizes, strides, padding='valid', data_format='channels_first'): ''' Extract the patches from an image @@ -197,3 +148,26 @@ def moments(x, axes, shift=None, keep_dims=False): var_batch = KTH.var(x, axis=axes, keepdims=keep_dims) return mean_batch, var_batch + + +def clip(x, min_value, max_value): + """Element-wise value clipping. + + If min_value > max_value, clipping range is [min_value,min_value]. + + # Arguments + x: Tensor or variable. + min_value: Tensor, float, int, or None. + If min_value is None, defaults to -infinity. + max_value: Tensor, float, int, or None. + If max_value is None, defaults to infinity. + + # Returns + A tensor. 
+ """ + if max_value is None: + max_value = np.inf + if min_value is None: + min_value = -np.inf + max_value = T.maximum(min_value, max_value) + return T.clip(x, min_value, max_value) diff --git a/keras_contrib/callbacks/dead_relu_detector.py b/keras_contrib/callbacks/dead_relu_detector.py index 2019f56..2cfe37b 100644 --- a/keras_contrib/callbacks/dead_relu_detector.py +++ b/keras_contrib/callbacks/dead_relu_detector.py @@ -1,8 +1,6 @@ import numpy as np -import warnings from keras.callbacks import Callback -from keras.layers import Dense from keras import backend as K @@ -13,10 +11,11 @@ class DeadReluDetector(Callback): # Arguments x_train: Training dataset to check whether or not neurons fire verbose: verbosity mode - True means that even a single dead neuron triggers warning + True means that even a single dead neuron triggers a warning message False means that only significant number of dead neurons (10% or more) - triggers warning + triggers a warning message """ + def __init__(self, x_train, verbose=False): super(DeadReluDetector, self).__init__() self.x_train = x_train @@ -25,7 +24,8 @@ class DeadReluDetector(Callback): @staticmethod def is_relu_layer(layer): - return isinstance(layer, Dense) and layer.get_config()['activation'] == 'relu' + # Should work for all layers with relu activation. Tested for Dense and Conv2D + return 'activation' in layer.get_config() and layer.get_config()['activation'] == 'relu' def get_relu_activations(self): model_input = self.model.input @@ -44,17 +44,43 @@ class DeadReluDetector(Callback): layer_outputs = [func(list_inputs)[0] for func in funcs] for layer_index, layer_activations in enumerate(layer_outputs): if self.is_relu_layer(self.model.layers[layer_index]): - yield [layer_index, layer_activations] + layer_name = self.model.layers[layer_index].name + # layer_weight is a list [W] (+ [b]) + layer_weight = self.model.layers[layer_index].get_weights() + # with kernel and bias, the weights are saved as a list [W, b]. 
If only weights, it is [W] + if type(layer_weight) is not list: + raise ValueError("'Layer_weight' should be a list, but was {}".format(type(layer_weight))) + + layer_weight_shape = np.shape(layer_weight[0]) + yield [layer_index, layer_activations, layer_name, layer_weight_shape] def on_epoch_end(self, epoch, logs={}): for relu_activation in self.get_relu_activations(): - layer_index, activation_values = relu_activation - total_neurons = activation_values.shape[-1] - dead_neurons = np.sum(activation_values == 0) - dead_neurons_share = dead_neurons / total_neurons - if (self.verbose and dead_neurons > 0) or dead_neurons_share > self.dead_neurons_share_threshold: - warnings.warn( - 'Layer #{} has {} dead neurons ({:.2%})!' - .format(layer_index, dead_neurons, dead_neurons_share), - RuntimeWarning - ) + layer_index, activation_values, layer_name, layer_weight_shape = relu_activation + + shape_act = activation_values.shape + + weight_len = len(layer_weight_shape) + act_len = len(shape_act) + + # should work for both Conv and Flat + if K.image_data_format() == 'channels_last': + # features in last axis + axis_filter = -1 + else: + # features before the convolution axis, for weight_len the input and output have to be subtracted + axis_filter = -1 - (weight_len - 2) + + total_featuremaps = shape_act[axis_filter] + + axis = tuple( + i for i in range(act_len) if (i != axis_filter) and (i != (len(shape_act) + axis_filter))) + + dead_neurons = np.sum(np.sum(activation_values, axis=axis) == 0) + + dead_neurons_share = float(dead_neurons) / float(total_featuremaps) + if (self.verbose and dead_neurons > 0) or dead_neurons_share >= self.dead_neurons_share_threshold: + str_warning = 'Layer {} (#{}) has {} dead neurons ({:.2%})!'.format(layer_name, layer_index, + dead_neurons, dead_neurons_share) + + print(str_warning) diff --git a/keras_contrib/datasets/conll2000.py b/keras_contrib/datasets/conll2000.py old mode 100644 new mode 100755 index 22a97e1..5561f17 --- 
a/keras_contrib/datasets/conll2000.py +++ b/keras_contrib/datasets/conll2000.py @@ -16,7 +16,7 @@ def load_data(path='conll2000.zip', min_freq=2): archive.close() word_counts = Counter(row[0].lower() for sample in train for row in sample) - vocab = ['', ''] + [w for w, f in word_counts.iteritems() if f >= min_freq] + vocab = ['', ''] + [w for w, f in iter(word_counts.items()) if f >= min_freq] pos_tags = sorted(list(set(row[1] for sample in train + test for row in sample))) # in alphabetic order chunk_tags = sorted(list(set(row[2] for sample in train + test for row in sample))) # in alphabetic order @@ -27,7 +27,7 @@ def load_data(path='conll2000.zip', min_freq=2): def _parse_data(fh): string = fh.read() - data = [[row.split() for row in sample.split('\n')] for sample in string.strip().split('\n\n')] + data = [[row.split() for row in sample.split('\n')] for sample in string.decode().strip().split('\n\n')] fh.close() return data diff --git a/keras_contrib/layers/advanced_activations.py b/keras_contrib/layers/advanced_activations.py index 179856c..7bf349e 100644 --- a/keras_contrib/layers/advanced_activations.py +++ b/keras_contrib/layers/advanced_activations.py @@ -236,3 +236,50 @@ class SReLU(Layer): return dict(list(base_config.items()) + list(config.items())) get_custom_objects().update({'SReLU': SReLU}) + + +class Swish(Layer): + """ Swish (Ramachandran et al., 2017) + + # Input shape + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + + # Output shape + Same shape as the input. + + # Arguments + beta: float >= 0.
Scaling factor + if set to 1 and trainable set to False (default), Swish equals the SiLU activation (Elfwing et al., 2017) + trainable: whether to learn the scaling factor during training or not + + # References + - [Searching for Activation Functions](https://arxiv.org/abs/1710.05941) + - [Sigmoid-weighted linear units for neural network function approximation in reinforcement learning](https://arxiv.org/abs/1702.03118) + """ + + def __init__(self, beta=1.0, trainable=False, **kwargs): + super(Swish, self).__init__(**kwargs) + self.supports_masking = True + self.beta = beta + self.trainable = trainable + + def build(self, input_shape): + self.scaling_factor = K.variable(self.beta, + dtype=K.floatx(), + name='scaling_factor') + if self.trainable: + self._trainable_weights.append(self.scaling_factor) + super(Swish, self).build(input_shape) + + def call(self, inputs, mask=None): + return inputs * K.sigmoid(self.scaling_factor * inputs) + + def get_config(self): + config = {'beta': self.get_weights()[0] if self.trainable else self.beta, + 'trainable': self.trainable} + base_config = super(Swish, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + +get_custom_objects().update({'Swish': Swish}) diff --git a/keras_contrib/layers/convolutional.py b/keras_contrib/layers/convolutional.py index c60df62..0899309 100644 --- a/keras_contrib/layers/convolutional.py +++ b/keras_contrib/layers/convolutional.py @@ -16,220 +16,6 @@ from keras.utils.conv_utils import normalize_data_format import numpy as np -class Deconvolution3D(Convolution3D): - """Transposed convolution operator for filtering windows of 3-D inputs. 
- - The need for transposed convolutions generally arises from the desire to - use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape - of the output of some convolution to something that has the shape - of its input while maintaining a connectivity pattern - that is compatible with said convolution. - - When using this layer as the first layer in a model, - provide the keyword argument `input_shape` - (tuple of integers, does not include the sample axis), - e.g. `input_shape=(3, 128, 128, 128)` for a 128x128x128 volume with - three channels. - - To pass the correct `output_shape` to this layer, - one could use a test model to predict and observe the actual output shape. - - # Examples - - ```python - # TH dim ordering. - # apply a 3x3x3 transposed convolution - # with stride 1x1x1 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 3, 14, 14, 14), - padding='valid', - input_shape=(3, 12, 12, 12))) - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 3, 12, 12, 12)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 3, 14, 14, 14) - - # apply a 3x3x3 transposed convolution - # with stride 2x2x2 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 3, 25, 25, 25), - strides=(2, 2, 2), - padding='valid', - input_shape=(3, 12, 12, 12))) - model.summary() - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 3, 12, 12, 12)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 3, 25, 25, 25) - ``` - - ```python - # TF dim ordering. 
- # apply a 3x3x3 transposed convolution - # with stride 1x1x1 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 14, 14, 14, 3), - padding='valid', - input_shape=(12, 12, 12, 3))) - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 12, 12, 12, 3)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 14, 14, 14, 3) - - # apply a 3x3x3 transposed convolution - # with stride 2x2x2 and 3 output filters on a 12x12x12 image: - model = Sequential() - model.add(Deconvolution3D(3, 3, 3, 3, output_shape=(None, 25, 25, 25, 3), - strides=(2, 2, 2), - padding='valid', - input_shape=(12, 12, 12, 3))) - model.summary() - - # we can predict with the model and print the shape of the array. - dummy_input = np.ones((32, 12, 12, 12, 3)) - preds = model.predict(dummy_input) - print(preds.shape) # (None, 25, 25, 25, 3) - ``` - - # Arguments - filters: Number of transposed convolution filters to use. - kernel_size: kernel_size: An integer or tuple/list of 3 integers, specifying the - dimensions of the convolution window. - output_shape: Output shape of the transposed convolution operation. - tuple of integers - `(nb_samples, filters, conv_dim1, conv_dim2, conv_dim3)`. - It is better to use - a dummy input and observe the actual output shape of - a layer, as specified in the examples. - init: name of initialization function for the weights of the layer - (see [initializers](../initializers.md)), or alternatively, - Theano function to use for weights initialization. - This parameter is only relevant if you don't pass - a `weights` argument. - activation: name of activation function to use - (see [activations](../activations.md)), - or alternatively, elementwise Theano/TensorFlow function. - If you don't specify anything, no activation is applied - (ie. "linear" activation: a(x) = x). - weights: list of numpy arrays to set as initial weights. 
- padding: 'valid', 'same' or 'full' - ('full' requires the Theano backend). - strides: tuple of length 3. Factor by which to oversample output. - Also called strides elsewhere. - kernel_regularizer: instance of [WeightRegularizer](../regularizers.md) - (eg. L1 or L2 regularization), applied to the main weights matrix. - bias_regularizer: instance of [WeightRegularizer](../regularizers.md), - applied to the use_bias. - activity_regularizer: instance of [ActivityRegularizer](../regularizers.md), - applied to the network output. - kernel_constraint: instance of the [constraints](../constraints.md) module - (eg. maxnorm, nonneg), applied to the main weights matrix. - bias_constraint: instance of the [constraints](../constraints.md) module, - applied to the use_bias. - data_format: 'channels_first' or 'channels_last'. In 'channels_first' mode, the channels dimension - (the depth) is at index 1, in 'channels_last' mode is it at index 4. - It defaults to the `image_data_format` value found in your - Keras config file at `~/.keras/keras.json`. - If you never set it, then it will be "tf". - use_bias: whether to include a use_bias - (i.e. make the layer affine rather than linear). - - # Input shape - 5D tensor with shape: - `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if data_format='channels_first' - or 5D tensor with shape: - `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if data_format='channels_last'. - - # Output shape - 5D tensor with shape: - `(samples, filters, nekernel_conv_dim1, nekernel_conv_dim2, nekernel_conv_dim3)` if data_format='channels_first' - or 5D tensor with shape: - `(samples, nekernel_conv_dim1, nekernel_conv_dim2, nekernel_conv_dim3, filters)` if data_format='channels_last'. - `nekernel_conv_dim1`, `nekernel_conv_dim2` and `nekernel_conv_dim3` values might have changed due to padding. 
- - # References - - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1) - - [Transposed convolution arithmetic](http://deeplearning.net/software/theano_versions/dev/tutorial/conv_arithmetic.html#transposed-convolution-arithmetic) - - [Deconvolutional Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf) - """ - - def __init__(self, filters, kernel_size, - output_shape, activation=None, weights=None, - padding='valid', strides=(1, 1, 1), data_format=None, - kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, - kernel_constraint=None, bias_constraint=None, - use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', **kwargs): - if padding not in {'valid', 'same', 'full'}: - raise ValueError('Invalid border mode for Deconvolution3D:', padding) - if len(output_shape) == 4: - # missing the batch size - output_shape = (None,) + tuple(output_shape) - - self.output_shape_ = output_shape - - super(Deconvolution3D, self).__init__(kernel_size=kernel_size, - filters=filters, - activation=activation, - weights=weights, - padding=padding, - strides=strides, - data_format=data_format, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - activity_regularizer=activity_regularizer, - kernel_constraint=kernel_constraint, - bias_constraint=bias_constraint, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - bias_initializer=bias_initializer, - **kwargs) - - def compute_output_shape(self, input_shape): - if self.data_format == 'channels_first': - conv_dim1 = self.output_shape_[2] - conv_dim2 = self.output_shape_[3] - conv_dim3 = self.output_shape_[4] - return (input_shape[0], self.filters, conv_dim1, conv_dim2, conv_dim3) - elif self.data_format == 'channels_last': - conv_dim1 = self.output_shape_[1] - conv_dim2 = self.output_shape_[2] - conv_dim3 = self.output_shape_[3] - return (input_shape[0], conv_dim1, conv_dim2, conv_dim3, self.filters) - else: - 
raise ValueError('Invalid data format: ', self.data_format) - - def call(self, x, mask=None): - kernel_shape = K.get_value(self.kernel).shape - output = K.deconv3d(x, self.kernel, self.output_shape_, - strides=self.strides, - padding=self.padding, - data_format=self.data_format, - filter_shape=kernel_shape) - if self.use_bias: - if self.data_format == 'channels_first': - output += K.reshape(self.bias, (1, self.filters, 1, 1, 1)) - elif self.data_format == 'channels_last': - output += K.reshape(self.bias, (1, 1, 1, 1, self.filters)) - else: - raise ValueError('Invalid data_format: ', self.data_format) - output = self.activation(output) - return output - - def get_config(self): - config = {'output_shape': self.output_shape_} - base_config = super(Deconvolution3D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -Deconv3D = Deconvolution3D -get_custom_objects().update({'Deconvolution3D': Deconvolution3D}) -get_custom_objects().update({'Deconv3D': Deconv3D}) - - class CosineConvolution2D(Layer): """Cosine Normalized Convolution operator for filtering windows of two-dimensional inputs. 
Cosine Normalization: Using Cosine Similarity Instead of Dot Product in Neural Networks diff --git a/keras_contrib/layers/normalization.py b/keras_contrib/layers/normalization.py index 10565b2..40254cd 100644 --- a/keras_contrib/layers/normalization.py +++ b/keras_contrib/layers/normalization.py @@ -266,13 +266,13 @@ class BatchRenormalization(Layer): name='{}_running_std'.format(self.name), trainable=False) - self.r_max = K.variable(np.ones((1,)), name='{}_r_max'.format(self.name)) + self.r_max = K.variable(1, name='{}_r_max'.format(self.name)) - self.d_max = K.variable(np.zeros((1,)), name='{}_d_max'.format(self.name)) + self.d_max = K.variable(0, name='{}_d_max'.format(self.name)) - self.t = K.variable(np.zeros((1,)), name='{}_t'.format(self.name)) + self.t = K.variable(0, name='{}_t'.format(self.name)) - self.t_delta_tensor = K.variable(np.array([self.t_delta])) + self.t_delta_tensor = K.constant(self.t_delta) if self.initial_weights is not None: self.set_weights(self.initial_weights) @@ -292,13 +292,11 @@ class BatchRenormalization(Layer): mean_batch, var_batch = K.moments(inputs, reduction_axes, shift=None, keep_dims=False) std_batch = (K.sqrt(var_batch + self.epsilon)) - r_max_value = K.get_value(self.r_max) r = std_batch / (K.sqrt(self.running_variance + self.epsilon)) - r = K.stop_gradient(K.clip(r, 1 / r_max_value, r_max_value)) + r = K.stop_gradient(K.clip(r, 1 / self.r_max, self.r_max)) - d_max_value = K.get_value(self.d_max) d = (mean_batch - self.running_mean) / K.sqrt(self.running_variance + self.epsilon) - d = K.stop_gradient(K.clip(d, -d_max_value, d_max_value)) + d = K.stop_gradient(K.clip(d, -self.d_max, self.d_max)) if sorted(reduction_axes) == range(K.ndim(inputs))[:-1]: x_normed_batch = (inputs - mean_batch) / std_batch diff --git a/keras_contrib/layers/recurrent.py b/keras_contrib/layers/recurrent.py index e85dc22..c85a6c6 100644 --- a/keras_contrib/layers/recurrent.py +++ b/keras_contrib/layers/recurrent.py @@ -8,5 +8,3 @@ from .. 
import initializers from .. import regularizers from keras.engine import Layer from keras.engine import InputSpec - -from keras.layers.recurrent import _time_distributed_dense diff --git a/keras_contrib/optimizers/ftml.py b/keras_contrib/optimizers/ftml.py index edfb9d7..7545934 100644 --- a/keras_contrib/optimizers/ftml.py +++ b/keras_contrib/optimizers/ftml.py @@ -2,7 +2,6 @@ from __future__ import absolute_import from keras.optimizers import Optimizer from .. import backend as K from keras.utils.generic_utils import get_custom_objects -from keras.legacy import interfaces class FTML(Optimizer): @@ -31,7 +30,6 @@ class FTML(Optimizer): self.epsilon = epsilon self.inital_decay = decay - @interfaces.legacy_get_updates_support def get_updates(self, loss, params): grads = self.get_gradients(loss, params) self.updates = [K.update_add(self.iterations, 1)] diff --git a/setup.py b/setup.py index aad0567..3c17537 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,32 @@ from setuptools import find_packages setup(name='keras_contrib', - version='1.2.1', - description='Keras community contributions', + version='2.0.8', + description='Keras Deep Learning for Python, Community Contributions', author='Fariz Rahman', author_email='farizrahman4u@gmail.com', url='https://github.com/farizrahman4u/keras-contrib', license='MIT', install_requires=['keras'], + extras_require={ + 'h5py': ['h5py'], + 'visualize': ['pydot>=1.2.0'], + 'tests': ['pytest', + 'pytest-pep8', + 'pytest-xdist', + 'pytest-cov'], + }, + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python 
Modules' + ], packages=find_packages()) diff --git a/tests/keras_contrib/backend/backend_test.py b/tests/keras_contrib/backend/backend_test.py index 64135b0..c3829e3 100644 --- a/tests/keras_contrib/backend/backend_test.py +++ b/tests/keras_contrib/backend/backend_test.py @@ -1,7 +1,6 @@ import pytest from numpy.testing import assert_allclose import numpy as np -import scipy.sparse as sparse from keras import backend as K from keras.backend import theano_backend as KTH, floatx, set_floatx, variable @@ -157,8 +156,46 @@ class TestBackend(object): th_var_val = KTH.eval(th_var) tf_var_val = KTF.eval(tf_var) - assert_allclose(th_mean_val, tf_mean_val, rtol=1e-4) - assert_allclose(th_var_val, tf_var_val, rtol=1e-4) + # absolute tolerance needed when working with zeros + assert_allclose(th_mean_val, tf_mean_val, rtol=1e-4, atol=1e-10) + assert_allclose(th_var_val, tf_var_val, rtol=1e-4, atol=1e-10) + + def test_clip(self): + check_single_tensor_operation('clip', (4, 2), min_value=0.4, max_value=0.6) + check_single_tensor_operation('clip', (4, 2), min_value=0.4, max_value=None) + + cases = [ + # (x, min_value, max_value, expected) + (1, 0, 2, 1), + (1, 2, 0, 2), + (-1, 0, 2, 0), + (-1, 2, 0, 2), + (3, 0, 2, 2), + (3, 2, 0, 2), + (1, 0, np.inf, 1), + (1, np.inf, 0, np.inf), + (1, 0, -np.inf, 0), + (1, -np.inf, 0, 0), + (-1, 0, -np.inf, 0), + (-1, -np.inf, 0, -1), + (1, 0, None, 1), + (-1, 0, None, 0), + + # NOTE: In the following two cases, Keras 2.0.8 raises an + # error on all backends, but this is a sensible extension. + (1, None, 0, 0), + (-1, None, 0, -1), + + # NOTE: In the following case, Keras 2.0.8 raises an error + # for TensorFlow and Theano, but returns 0 for CNTK. This + # extends the TensorFlow and Theano backends to match the + # CNTK behavior instead of raising an error.
+ (0, None, None, 0), + ] + for K_, KC_ in [(KTF, KCTF), (KTH, KCTH)]: + for x, min_value, max_value, expected in cases: + actual = K_.eval(KC_.clip(K_.constant(x), min_value, max_value)) + assert_allclose(expected, actual, atol=1e-5) if __name__ == '__main__': diff --git a/tests/keras_contrib/callbacks/dead_relu_detector_test.py b/tests/keras_contrib/callbacks/dead_relu_detector_test.py index 9a37df9..5f7c396 100644 --- a/tests/keras_contrib/callbacks/dead_relu_detector_test.py +++ b/tests/keras_contrib/callbacks/dead_relu_detector_test.py @@ -1,40 +1,191 @@ import pytest -import warnings import numpy as np +import sys + +if (sys.version_info > (3, 0)): + from io import StringIO +else: + from StringIO import StringIO from keras_contrib import callbacks from keras.models import Sequential -from keras.layers import Dense +from keras.layers import Dense, Conv2D, Flatten +from keras import backend as K + +n_out = 11 # with 1 neuron dead, 1/11 is just below the threshold of 10% with verbose = False + + +def check_print(do_train, expected_warnings, nr_dead=None, perc_dead=None): + """ + Receive stdout to check if correct warning message is delivered + :param nr_dead: int + :param perc_dead: float, 10% should be written as 0.1 + """ + + saved_stdout = sys.stdout + + out = StringIO() + out.flush() + sys.stdout = out # overwrite current stdout + + do_train() + + stdoutput = out.getvalue().strip() # get prints, can be something like: "Layer dense (#0) has 2 dead neurons (20.00%)!" 
+ str_to_count = "dead neurons" + count = stdoutput.count(str_to_count) + + sys.stdout = saved_stdout # restore stdout + out.close() + + assert expected_warnings == count + if expected_warnings and (nr_dead is not None): + str_to_check = 'has {} dead'.format(nr_dead) + assert str_to_check in stdoutput, '"{}" not in "{}"'.format(str_to_check, stdoutput) + if expected_warnings and (perc_dead is not None): + str_to_check = 'neurons ({:.2%})!'.format(perc_dead) + assert str_to_check in stdoutput, '"{}" not in "{}"'.format(str_to_check, stdoutput) def test_DeadDeadReluDetector(): - def do_test(weights, expected_warnings, verbose): - with warnings.catch_warnings(record=True) as w: - dataset = np.ones((1, 1, 1)) # data to be fed as training + n_samples = 9 + + input_shape = (n_samples, 3, 4) # 4 input features + shape_out = (n_samples, 3, n_out) # 11 output features + shape_weights = (4, n_out) + + # ignore batch size + input_shape_dense = tuple(input_shape[1:]) + + def do_test(weights, expected_warnings, verbose, nr_dead=None, perc_dead=None): + + def do_train(): + dataset = np.ones(input_shape) # data to be fed as training model = Sequential() - model.add(Dense(10, activation='relu', input_shape=(1, 1), use_bias=False, weights=[weights])) + model.add(Dense(n_out, activation='relu', input_shape=input_shape_dense, + use_bias=False, weights=[weights], name='dense')) model.compile(optimizer='sgd', loss='categorical_crossentropy') model.fit( dataset, - np.ones((1, 1, 10)), + np.ones(shape_out), + batch_size=1, epochs=1, callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)], verbose=False ) - assert len(w) == expected_warnings - for warn_item in w: - assert issubclass(warn_item.category, RuntimeWarning) - assert "dead neurons" in str(warn_item.message) - weights_1_dead = np.ones((1, 10)) # weights that correspond to NN with 1/10 neurons dead + check_print(do_train, expected_warnings, nr_dead, perc_dead) + + weights_1_dead = np.ones(shape_weights) # weights that 
correspond to NN with 1/11 neurons dead + weights_2_dead = np.ones(shape_weights) # weights that correspond to NN with 2/11 neurons dead + weights_all_dead = np.zeros(shape_weights) # weights that correspond to all neurons dead + weights_1_dead[:, 0] = 0 - weights_2_dead = np.ones((1, 10)) # weights that correspond to NN with 2/10 neurons dead - weights_2_dead[:, 0] = 0 - weights_2_dead[:, 1] = 0 + weights_2_dead[:, 0:2] = 0 - do_test(weights_1_dead, verbose=True, expected_warnings=1) + do_test(weights_1_dead, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out) do_test(weights_1_dead, verbose=False, expected_warnings=0) - do_test(weights_2_dead, verbose=True, expected_warnings=1) + do_test(weights_2_dead, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out) + # do_test(weights_all_dead, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.) + + +def test_DeadDeadReluDetector_bias(): + n_samples = 9 + + input_shape = (n_samples, 4) # 4 input features + shape_weights = (4, n_out) + shape_bias = (n_out, ) + shape_out = (n_samples, n_out) # 11 output features + + # ignore batch size + input_shape_dense = tuple(input_shape[1:]) + + def do_test(weights, bias, expected_warnings, verbose, nr_dead=None, perc_dead=None): + + def do_train(): + dataset = np.ones(input_shape) # data to be fed as training + model = Sequential() + model.add(Dense(n_out, activation='relu', input_shape=input_shape_dense, + use_bias=True, weights=[weights, bias], name='dense')) + model.compile(optimizer='sgd', loss='categorical_crossentropy') + model.fit( + dataset, + np.ones(shape_out), + batch_size=1, + epochs=1, + callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)], + verbose=False + ) + + check_print(do_train, expected_warnings, nr_dead, perc_dead) + + weights_1_dead = np.ones(shape_weights) # weights that correspond to NN with 1/11 neurons dead + weights_2_dead = np.ones(shape_weights) # weights that correspond to NN with 2/11 neurons dead + 
weights_all_dead = np.zeros(shape_weights) # weights that correspond to all neurons dead + + weights_1_dead[:, 0] = 0 + weights_2_dead[:, 0:2] = 0 + + bias = np.zeros(shape_bias) + + do_test(weights_1_dead, bias, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out) + do_test(weights_1_dead, bias, verbose=False, expected_warnings=0) + do_test(weights_2_dead, bias, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out) + # do_test(weights_all_dead, bias, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.) + + +def test_DeadDeadReluDetector_conv(): + n_samples = 9 + + # (5, 5) kernel, 4 input featuremaps and 11 output featuremaps + if K.image_data_format() == 'channels_last': + input_shape = (n_samples, 5, 5, 4) + else: + input_shape = (n_samples, 4, 5, 5) + + # ignore batch size + input_shape_conv = tuple(input_shape[1:]) + shape_weights = (5, 5, 4, n_out) + shape_out = (n_samples, n_out) + + def do_test(weights_bias, expected_warnings, verbose, nr_dead=None, perc_dead=None): + """ + :param perc_dead: as float, 10% should be written as 0.1 + """ + + def do_train(): + dataset = np.ones(input_shape) # data to be fed as training + model = Sequential() + model.add(Conv2D(n_out, (5, 5), activation='relu', input_shape=input_shape_conv, + use_bias=True, weights=weights_bias, name='conv')) + model.add(Flatten()) # to handle Theano's categorical crossentropy + model.compile(optimizer='sgd', loss='categorical_crossentropy') + model.fit( + dataset, + np.ones(shape_out), + batch_size=1, + epochs=1, + callbacks=[callbacks.DeadReluDetector(dataset, verbose=verbose)], + verbose=False + ) + + check_print(do_train, expected_warnings, nr_dead, perc_dead) + + weights_1_dead = np.ones(shape_weights) # weights that correspond to NN with 1/11 neurons dead + weights_1_dead[..., 0] = 0 + weights_2_dead = np.ones(shape_weights) # weights that correspond to NN with 2/11 neurons dead + weights_2_dead[..., 0:2] = 0 + weights_all_dead = 
np.zeros(shape_weights) # weights that correspond to NN with all neurons dead + + bias = np.zeros((11, )) + + weights_bias_1_dead = [weights_1_dead, bias] + weights_bias_2_dead = [weights_2_dead, bias] + weights_bias_all_dead = [weights_all_dead, bias] + + do_test(weights_bias_1_dead, verbose=True, expected_warnings=1, nr_dead=1, perc_dead=1. / n_out) + do_test(weights_bias_1_dead, verbose=False, expected_warnings=0) + do_test(weights_bias_2_dead, verbose=True, expected_warnings=1, nr_dead=2, perc_dead=2. / n_out) + # do_test(weights_bias_all_dead, verbose=True, expected_warnings=1, nr_dead=n_out, perc_dead=1.) if __name__ == '__main__': diff --git a/tests/keras_contrib/layers/test_advanced_activations.py b/tests/keras_contrib/layers/test_advanced_activations.py index f07a690..8c71426 100644 --- a/tests/keras_contrib/layers/test_advanced_activations.py +++ b/tests/keras_contrib/layers/test_advanced_activations.py @@ -26,5 +26,18 @@ def test_srelu_share(): layer_test(advanced_activations.SReLU, kwargs={'shared_axes': 1}, input_shape=(2, 3, 4)) + +@keras_test +def test_swish_constant(): + layer_test(advanced_activations.Swish, kwargs={'beta': 1.0, 'trainable': False}, + input_shape=(2, 3, 4)) + + +@keras_test +def test_swish_trainable(): + layer_test(advanced_activations.Swish, kwargs={'beta': 1.0, 'trainable': True}, + input_shape=(2, 3, 4)) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras_contrib/layers/test_convolutional.py b/tests/keras_contrib/layers/test_convolutional.py index 1760226..d207656 100644 --- a/tests/keras_contrib/layers/test_convolutional.py +++ b/tests/keras_contrib/layers/test_convolutional.py @@ -17,67 +17,6 @@ else: _convolution_border_modes = ['valid', 'same'] -@keras_test -def test_deconvolution_3d(): - num_samples = 6 - num_filter = 4 - stack_size = 2 - kernel_dim1 = 12 - kernel_dim2 = 10 - kernel_dim3 = 8 - - for batch_size in [None, num_samples]: - for border_mode in _convolution_border_modes: - for subsample 
in [(1, 1, 1), (2, 2, 2)]: - if border_mode == 'same' and subsample != (1, 1, 1): - continue - - dim1 = conv_input_length(kernel_dim1, 7, - border_mode, - subsample[0]) - dim2 = conv_input_length(kernel_dim2, 5, - border_mode, - subsample[1]) - dim3 = conv_input_length(kernel_dim3, 3, - border_mode, - subsample[2]) - layer_test(convolutional.Deconvolution3D, - kwargs={'filters': num_filter, - 'kernel_size': (7, 5, 3), - 'output_shape': (batch_size, num_filter, dim1, dim2, dim3), - 'padding': border_mode, - 'strides': subsample, - 'data_format': 'channels_first'}, - input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3), - - fixed_batch_size=True, tolerance=None) - - layer_test(convolutional.Deconvolution3D, - kwargs={'filters': num_filter, - 'kernel_size': (7, 5, 3), - 'output_shape': (batch_size, num_filter, dim1, dim2, dim3), - 'padding': border_mode, - 'strides': subsample, - 'data_format': 'channels_first', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2'}, - input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3), - fixed_batch_size=True, tolerance=None) - - layer_test(convolutional.Deconvolution3D, - kwargs={'filters': num_filter, - 'kernel_size': (7, 5, 3), - 'output_shape': (num_filter, dim1, dim2, dim3), - 'padding': border_mode, - 'strides': subsample, - 'data_format': 'channels_first', - 'kernel_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2'}, - input_shape=(num_samples, stack_size, kernel_dim1, kernel_dim2, kernel_dim3), tolerance=None) - - @keras_test def test_cosineconvolution_2d(): num_samples = 2 diff --git a/tests/keras_contrib/layers/test_normalization.py b/tests/keras_contrib/layers/test_normalization.py index fe2a172..1c74a04 100644 --- a/tests/keras_contrib/layers/test_normalization.py +++ b/tests/keras_contrib/layers/test_normalization.py @@ -188,7 +188,7 @@ def test_instancenorm_perchannel_correctness(): for channel in range(3): 
activations = out[instance, channel] assert abs(activations.mean()) > 1e-2 - assert abs(activations.std() - 1.0) > 1e-2 + assert abs(activations.std() - 1.0) > 1e-6 # but values are still normalized per-instance activations = out[instance] @@ -305,5 +305,37 @@ def test_shared_batchrenorm(): new_model.train_on_batch(x, x) +@keras_test +def test_batchrenorm_clipping_schedule(): + '''Test that the clipping schedule isn't fixed at r_max=1, d_max=0''' + inp = Input(shape=(10,)) + bn = normalization.BatchRenormalization(t_delta=1.) + out = bn(inp) + model = Model(inp, out) + model.compile('sgd', 'mse') + + x = np.random.normal(5, 10, size=(2, 10)) + y = np.random.normal(5, 10, size=(2, 10)) + + r_max, d_max = K.get_value(bn.r_max), K.get_value(bn.d_max) + assert r_max == 1 + assert d_max == 0 + + for i in range(10): + model.train_on_batch(x, y) + + r_max, d_max = K.get_value(bn.r_max), K.get_value(bn.d_max) + assert_allclose([r_max, d_max], [3, 5], atol=1e-1) + + +@keras_test +def test_batchrenorm_get_config(): + '''Test that get_config works on a model with a batchrenorm layer.''' + x = Input(shape=(10,)) + y = normalization.BatchRenormalization()(x) + model = Model(x, y) + model.get_config() + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras_contrib/utils/save_load_utils_test.py b/tests/keras_contrib/utils/save_load_utils_test.py index 67f55fc..a11e826 100644 --- a/tests/keras_contrib/utils/save_load_utils_test.py +++ b/tests/keras_contrib/utils/save_load_utils_test.py @@ -1,12 +1,16 @@ import pytest +import os from keras import backend as K from keras.layers import Input, Dense from keras.models import Model from numpy.testing import assert_allclose +from keras.utils.test_utils import keras_test from keras_contrib.utils.save_load_utils import save_all_weights, load_all_weights +@pytest.mark.skipif(K.backend() != 'tensorflow', reason='save_all_weights and load_all_weights only supported on TensorFlow') +@keras_test def 
test_save_and_load_all_weights(): ''' Test save_all_weights and load_all_weights. Save and load optimizer and model weights but not configuration. @@ -33,15 +37,16 @@ def test_save_and_load_all_weights(): ow1value[0, 0:3] = [4, 2, 0] K.set_value(ow1, ow1value) # save all weights - save_all_weights(m1, "model.h5") + save_all_weights(m1, 'model.h5') # new model m2 = make_model() # load all weights - load_all_weights(m2, "model.h5") + load_all_weights(m2, 'model.h5') # check weights assert_allclose(K.get_value(m2.layers[1].kernel)[0, 0:4], [1, 3, 3, 7]) # check optimizer weights assert_allclose(K.get_value(m2.optimizer.weights[3])[0, 0:3], [4, 2, 0]) + os.remove('model.h5') if __name__ == '__main__':