[tune] Added pbt with keras on cifar10 dataset example (#1729)

* [tune] Added pbt with keras on cifar10 dataset example * ENH: add gpu resources * CLN: requires 4 GPUs resource * CLN: use single quotes * CLN: don't save model by default
2026-06-28 21:46:57 +08:00 · 2018-03-26 06:57:23 +08:00
parent 0ad1054b8b
commit 6b1e592d5c
1 changed files with 176 additions and 0 deletions
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Train keras CNN on the CIFAR10 small images dataset.
+
+The model comes from: https://zhuanlan.zhihu.com/p/29214791,
+and it gets to about 87% validation accuracy in 100 epochs.
+
+Note that the scipt requires a machine with 4 GPUs. You
+can set {'gpu': 0} to use CPUs for training, although
+it is less efficient.
+"""
+
+from __future__ import print_function
+
+import argparse
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.keras.backend import set_session
+from tensorflow.python.keras.datasets import cifar10
+from tensorflow.python.keras.layers import Input, Dense, Dropout, Flatten
+from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
+
+import ray
+from ray.tune import grid_search, run_experiments
+from ray.tune import register_trainable
+from ray.tune import Trainable
+from ray.tune import TrainingResult
+from ray.tune.pbt import PopulationBasedTraining
+
+
+num_classes = 10
+
+
+class Cifar10Model(Trainable):
+
+    def _read_data(self):
+        # The data, split between train and test sets:
+        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
+
+        # Convert class vectors to binary class matrices.
+        y_train = tf.keras.utils.to_categorical(y_train, num_classes)
+        y_test = tf.keras.utils.to_categorical(y_test, num_classes)
+
+        x_train = x_train.astype('float32')
+        x_train /= 255
+        x_test = x_test.astype('float32')
+        x_test /= 255
+
+        return (x_train, y_train), (x_test, y_test)
+
+    def _build_model(self, input_shape):
+        x = Input(shape=(32, 32, 3))
+        y = x
+        y = Convolution2D(filters=64, kernel_size=3, strides=1, padding='same', activation='relu',
+                          kernel_initializer='he_normal')(y)
+        y = Convolution2D(filters=64, kernel_size=3, strides=1, padding='same', activation='relu',
+                          kernel_initializer='he_normal')(y)
+        y = MaxPooling2D(pool_size=2, strides=2, padding='same')(y)
+
+        y = Convolution2D(filters=128, kernel_size=3, strides=1, padding='same', activation='relu',
+                          kernel_initializer='he_normal')(y)
+        y = Convolution2D(filters=128, kernel_size=3, strides=1, padding='same', activation='relu',
+                          kernel_initializer='he_normal')(y)
+        y = MaxPooling2D(pool_size=2, strides=2, padding='same')(y)
+
+        y = Convolution2D(filters=256, kernel_size=3, strides=1, padding='same', activation='relu',
+                          kernel_initializer='he_normal')(y)
+        y = Convolution2D(filters=256, kernel_size=3, strides=1, padding='same', activation='relu',
+                          kernel_initializer='he_normal')(y)
+        y = MaxPooling2D(pool_size=2, strides=2, padding='same')(y)
+
+        y = Flatten()(y)
+        y = Dropout(self.config['dropout'])(y)
+        y = Dense(units=10, activation='softmax', kernel_initializer='he_normal')(y)
+
+        model = Model(inputs=x, outputs=y, name='model1')
+        return model
+
+    def _setup(self):
+        self.train_data, self.test_data = self._read_data()
+        x_train = self.train_data[0]
+        model = self._build_model(x_train.shape[1:])
+
+        opt = tf.keras.optimizers.Adadelta()
+        model.compile(loss='categorical_crossentropy',
+                      optimizer=opt,
+                      metrics=['accuracy'])
+        self.model = model
+
+    def _train(self):
+        x_train, y_train = self.train_data
+        x_test, y_test = self.test_data
+
+        aug_gen = ImageDataGenerator(
+            featurewise_center=False,             # set input mean to 0 over the dataset
+            samplewise_center=False,              # set each sample mean to 0
+            featurewise_std_normalization=False,  # divide inputs by std of the dataset
+            samplewise_std_normalization=False,   # divide each input by its std
+            zca_whitening=False,                  # apply ZCA whitening
+            rotation_range=0,                     # randomly rotate images in the range (degrees, 0 to 180)
+            width_shift_range=0.1,                # randomly shift images horizontally (fraction of total width)
+            height_shift_range=0.1,               # randomly shift images vertically (fraction of total height)
+            horizontal_flip=True,                 # randomly flip images
+            vertical_flip=False,                  # randomly flip images
+        )
+
+        aug_gen.fit(x_train)
+        gen = aug_gen.flow(x_train, y_train, batch_size=self.config['batch_size'])
+        self.model.fit_generator(generator=gen,
+                                 steps_per_epoch=50000 // self.config['batch_size'],
+                                 epochs=self.config['epochs'],
+                                 validation_data=None)
+
+        # loss, accuracy
+        _, accuracy = self.model.evaluate(x_test, y_test, verbose=0)
+        return TrainingResult(timesteps_this_iter=10,
+                              mean_accuracy=accuracy)
+
+    def _save(self, checkpoint_dir):
+        file_path = checkpoint_dir + '/model'
+        self.model.save_weights(file_path)
+        return file_path
+
+    def _restore(self, path):
+        self.model.load_weights(path)
+
+    def _stop(self):
+        # If need, save your model when exit.
+        # saved_path = self.model.save(self.logdir)
+        # print('save model at: ', saved_path)
+        pass
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--smoke-test',
+                        action='store_true',
+                        help='Finish quickly for testing')
+    args, _ = parser.parse_known_args()
+
+    register_trainable('train_cifar10', Cifar10Model)
+    train_spec = {
+        'run': 'train_cifar10',
+        'resources': { 'cpu': 6, 'gpu': 4 },
+        'stop': {
+            'mean_accuracy': 0.80,
+            'timesteps_total': 300,
+        },
+        'config': {
+            'epochs': 1,
+            'batch_size': 64,
+            'lr': grid_search([10 ** -4, 10 ** -5]),
+            'decay': lambda spec: spec.config.lr / 100.0,
+            'dropout': grid_search([0.25, 0.5]),
+        },
+        'repeat': 1,
+    }
+
+    if args.smoke_test:
+      train_spec['config']['lr'] = 10 ** -4
+      train_spec['config']['dropout'] = 0.5
+
+    ray.init()
+
+    pbt = PopulationBasedTraining(
+        time_attr='timesteps_total', reward_attr='mean_accuracy',
+        perturbation_interval=10,
+        hyperparam_mutations={
+            'dropout': lambda _: np.random.uniform(0, 1),
+        })
+
+    run_experiments({'pbt_cifar10': train_spec},
+                    scheduler=pbt)