In this project, I will use the Quick Draw dataset.
The Quick Draw Dataset is a collection of 50 million drawings across 345 categories, contributed by players of the game Quick, Draw!. The bitmap dataset contains these drawings converted from vector format into 28x28 grayscale images.
I will focus on a subset of 10 classes of type "body part".
The class labels are:
Label | Description
---|---
0 | arm
1 | ear
2 | eye
3 | face
4 | foot
5 | hand
6 | knee
7 | leg
8 | mouth
9 | nose
Players draw items as quickly as possible, and as you can see, some images are quite bad! I will nevertheless try to train a CNN classifier to see whether the classes can be distinguished. I will then train an autoencoder to learn a latent-space representation of the dataset.
# Standard import(s)
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pandas as pd
# plot_model moved in the public API around TF 2.8; choose the import accordingly.
# (Parsing major/minor as integers avoids e.g. "2.10" comparing as 2.1.)
if tuple(int(v) for v in tf.__version__.split('.')[:2]) > (2, 8):
    from tensorflow.keras.utils import plot_model
else:
    from tensorflow.python.keras.utils.vis_utils import plot_model
# Keras import(s)
from tensorflow.python.keras import metrics
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, Flatten, Reshape, Conv2D, Conv2DTranspose, MaxPooling2D, AveragePooling2D, UpSampling2D
from tensorflow.keras import regularizers
# Disable eager execution: training then uses the TF1-style graph loop
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
from tensorflow.python.keras.backend import set_image_data_format
set_image_data_format('channels_last') # Images are formatted as (N, N, 1), with colour-channels last
First, I load the preprocessed dataset from a numpy data file, which has already been split into training and testing parts.
# Load the quickdraw body-parts dataset
X_train, y_train, X_test, y_test = np.load('quickdraw_bodyparts.npy', allow_pickle=True)
# Image labels
body_parts = ['arm', 'ear', 'eye', 'face', 'foot', 'hand', 'knee', 'leg', 'mouth', 'nose']
# Each label of the dataset corresponds to a class name
class_names = body_parts
nb_train = X_train.shape[0]
nb_test = X_test.shape[0]
nb_classes = int(y_train.max()) + 1  # labels are stored as floats, so cast to int
print("Number of training examples: {}".format(nb_train))
print("Number of testing examples: {}".format(nb_test))
print("Number of target classes: {}".format(nb_classes))
# Get image shape
shape = X_train.shape[1:]
print("Image shape: {}".format(shape))
Number of training examples: 160000
Number of testing examples: 40000
Number of target classes: 10
Image shape: (28, 28, 1)
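For reference, here is a hypothetical sketch of how such a file could be assembled from the per-category bitmap arrays that Google publishes for Quick Draw (each an (N, 784) uint8 array). The file paths, per-class counts, and split below are assumptions for illustration, not the actual preprocessing:
# Hypothetical preprocessing sketch; paths and counts are assumptions
from sklearn.model_selection import train_test_split

per_class = 20000  # assumed: 10 classes x 20k images gives the 160k/40k split
X_parts, y_parts = [], []
for label, name in enumerate(body_parts):
    bitmaps = np.load("numpy_bitmap/{}.npy".format(name))[:per_class]  # (per_class, 784) uint8
    X_parts.append(bitmaps.reshape(-1, 28, 28, 1).astype(np.float32) / 255.)  # scale to [0, 1]
    y_parts.append(np.full(per_class, label, dtype=np.float32))
X_all, y_all = np.concatenate(X_parts), np.concatenate(y_parts)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.2, random_state=0)
np.save("quickdraw_bodyparts.npy", np.array([X_tr, y_tr, X_te, y_te], dtype=object))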
Plotting 10 training images, one per category, with their labels.
plt.figure(figsize=(10,10))
for i in range(len(body_parts)):
    plt.subplot(1, 10, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # Show the first training image of class i
    plt.imshow(X_train[y_train == i][0], cmap=plt.cm.binary)
    plt.xlabel("{} ({})".format(class_names[i], i))
plt.show()
Plot the first 25 images in a 5x5 grid and label them.
nb_show = 25
plt.figure(figsize=(10,10))
for i in range(nb_show):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X_train[i], cmap=plt.cm.binary)
    plt.xlabel(class_names[int(y_train[i])])
plt.show()
Applying different convolution kernels to an image of the "face" class.
myface = X_train[y_train == 3][0]
plt.imshow(myface,cmap=plt.cm.binary)
<matplotlib.image.AxesImage at 0x7f6b8f8d8fd0>
Defining 3x3 kernels as numpy arrays. We add them to a dictionary of kernels.
k_identity = np.zeros((3,3,1))
k_identity[1,1] = 1
k_edge = np.array([[[-1],[-1],[-1]], [[-1],[8],[-1]], [[-1],[-1],[-1]]])
k_sharp = np.array([[[0],[-1],[0]], [[-1],[4],[-1]], [[0],[-1],[0]]])
k_blur = np.ones((3,3,1))/9
kernels = {"identity": k_identity, "edge": k_edge, "sharp": k_sharp, "blur": k_blur}
myface.shape
(28, 28, 1)
def run_kernel(fig, kernel):
    # Output array for the transformed image; without padding ('valid'
    # convolution), a 3x3 kernel shrinks the image by 2 pixels per axis
    out = np.empty((fig.shape[0]-2, fig.shape[1]-2, 1))
    for i in range(fig.shape[0]-2):
        for j in range(fig.shape[1]-2):
            out[i][j] = np.sum(np.multiply(kernel, fig[i:i+kernel.shape[0], j:j+kernel.shape[1]]))
    return out
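As a quick sanity check (assuming scipy is available), run_kernel should agree with scipy's 2-D cross-correlation in 'valid' mode, since it slides the kernel over the image without flipping it:
# Sanity check: run_kernel is a 'valid'-mode cross-correlation
from scipy.signal import correlate2d
ref = correlate2d(myface[:, :, 0], k_edge[:, :, 0], mode='valid')
assert np.allclose(run_kernel(myface, k_edge)[:, :, 0], ref)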
# Import the normaliser for the bitmap
from matplotlib.colors import Normalize as norm
plt.figure(figsize=(10,10))
# Loop over the kernels
for i, kernel in enumerate(kernels):
    plt.subplot(1, 4, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # Apply the kernel and plot the transformed image
    plt.imshow(run_kernel(myface, kernels[kernel]), cmap=plt.cm.binary, norm=norm(0,1))
    plt.xlabel(kernel)
plt.show()
Next, I train a convolutional neural network (CNN) to classify the sketches; a standard form of image classification.
# Connect input, intermediate, and output layers using the Keras functional API
i = Input(shape=shape)
x = Conv2D(8, kernel_size=(3,3), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4))(i)
x = MaxPooling2D(pool_size=(2,2))(x)
x = Conv2D(16, kernel_size=(3,3), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4))(x)
x = MaxPooling2D(pool_size=(2,2))(x)
x = Conv2D(24, kernel_size=(3,3), activation='relu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4))(x)
x = Flatten()(x)
x = Dense(20, activation='relu')(x)
o = Dense(nb_classes, activation='softmax')(x)
# Create Model
cnn = Model(i, o, name='CNN')
plot_model(cnn, show_shapes=True)
cnn.summary()
Model: "CNN" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 28, 28, 1)] 0 _________________________________________________________________ conv2d (Conv2D) (None, 26, 26, 8) 80 _________________________________________________________________ max_pooling2d (MaxPooling2D) (None, 13, 13, 8) 0 _________________________________________________________________ conv2d_1 (Conv2D) (None, 11, 11, 16) 1168 _________________________________________________________________ max_pooling2d_1 (MaxPooling2 (None, 5, 5, 16) 0 _________________________________________________________________ conv2d_2 (Conv2D) (None, 3, 3, 24) 3480 _________________________________________________________________ flatten (Flatten) (None, 216) 0 _________________________________________________________________ dense (Dense) (None, 20) 4340 _________________________________________________________________ dense_1 (Dense) (None, 10) 210 ================================================================= Total params: 9,278 Trainable params: 9,278 Non-trainable params: 0 _________________________________________________________________
cnn.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
history_cnn = cnn.fit(x=X_train, y=y_train, epochs=20, validation_split=0.2, batch_size=32)
Train on 128000 samples, validate on 32000 samples
Epoch  1/20 - 9s - loss: 0.7984 - accuracy: 0.7517 - val_loss: 0.6221 - val_accuracy: 0.8067
Epoch  2/20 - 8s - loss: 0.5849 - accuracy: 0.8170 - val_loss: 0.5748 - val_accuracy: 0.8201
Epoch  3/20 - 7s - loss: 0.5374 - accuracy: 0.8301 - val_loss: 0.5354 - val_accuracy: 0.8304
Epoch  4/20 - 7s - loss: 0.5072 - accuracy: 0.8399 - val_loss: 0.5256 - val_accuracy: 0.8337
Epoch  5/20 - 8s - loss: 0.4873 - accuracy: 0.8466 - val_loss: 0.5040 - val_accuracy: 0.8418
Epoch  6/20 - 7s - loss: 0.4734 - accuracy: 0.8518 - val_loss: 0.4976 - val_accuracy: 0.8416
Epoch  7/20 - 8s - loss: 0.4615 - accuracy: 0.8552 - val_loss: 0.4831 - val_accuracy: 0.8478
Epoch  8/20 - 7s - loss: 0.4524 - accuracy: 0.8576 - val_loss: 0.4800 - val_accuracy: 0.8487
Epoch  9/20 - 8s - loss: 0.4449 - accuracy: 0.8598 - val_loss: 0.4779 - val_accuracy: 0.8508
Epoch 10/20 - 8s - loss: 0.4383 - accuracy: 0.8628 - val_loss: 0.4781 - val_accuracy: 0.8508
Epoch 11/20 - 8s - loss: 0.4332 - accuracy: 0.8636 - val_loss: 0.4700 - val_accuracy: 0.8518
Epoch 12/20 - 9s - loss: 0.4282 - accuracy: 0.8653 - val_loss: 0.4642 - val_accuracy: 0.8547
Epoch 13/20 - 11s - loss: 0.4246 - accuracy: 0.8665 - val_loss: 0.4730 - val_accuracy: 0.8510
Epoch 14/20 - 11s - loss: 0.4226 - accuracy: 0.8669 - val_loss: 0.4682 - val_accuracy: 0.8534
Epoch 15/20 - 9s - loss: 0.4189 - accuracy: 0.8681 - val_loss: 0.4659 - val_accuracy: 0.8537
Epoch 16/20 - 8s - loss: 0.4150 - accuracy: 0.8698 - val_loss: 0.4662 - val_accuracy: 0.8538
Epoch 17/20 - 9s - loss: 0.4134 - accuracy: 0.8700 - val_loss: 0.4802 - val_accuracy: 0.8507
Epoch 18/20 - 9s - loss: 0.4101 - accuracy: 0.8711 - val_loss: 0.4691 - val_accuracy: 0.8517
Epoch 19/20 - 12s - loss: 0.4092 - accuracy: 0.8711 - val_loss: 0.4716 - val_accuracy: 0.8558
Epoch 20/20 - 9s - loss: 0.4074 - accuracy: 0.8721 - val_loss: 0.4615 - val_accuracy: 0.8578
X_train.shape
(160000, 28, 28, 1)
o.shape
TensorShape([None, 10])
plt.plot(history_cnn.history['loss'])
plt.plot(history_cnn.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# Evaluating performance on test dataset
_, acc = cnn.evaluate(X_test, y_test)
print("Testing accuray: {:.1f}%".format(acc * 100.))
Testing accuray: 85.5%
# Show some examples of misclassifications
# Predict the class probabilities for the 10 classes
p_test = cnn.predict(X_test)
# Pick the index of the highest-probability class per image
py_test = p_test.argmax(axis=-1)
# Find where the prediction fails to match the target
ix_wrong = np.where(py_test != y_test)[0]
plt.figure(figsize=(10,10))
for i in range(nb_show):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X_test[ix_wrong[i]], cmap='Greys')
    # Label each image as "predicted (true)"
    plt.xlabel("{} ({})".format(class_names[py_test[ix_wrong[i]]],
                                class_names[int(y_test[ix_wrong[i]])]))
plt.show()
print("True label: {}".format(y_test[ix_wrong[:nb_show]]))
print("Pred. label: {}".format(py_test[ix_wrong[:nb_show]]))
True label: [7. 4. 0. 4. 9. 0. 6. 6. 9. 9. 9. 1. 9. 0. 9. 3. 4. 6. 9. 6. 6. 0. 9. 0. 7.] Pred. label: [6 7 7 2 6 4 9 7 6 4 6 4 0 4 4 2 0 4 7 7 9 4 6 9 4]
cm = confusion_matrix(y_test, py_test, normalize="true")
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, cmap=plt.cm.Blues,annot=True,fmt=".3f")
plt.xlabel("Predicted labels")
plt.ylabel("True labels")
ax.set_yticklabels(body_parts)
ax.set_xticklabels(body_parts)
plt.title('Confusion matrix ')
plt.show()
The misclassified images seem pretty reasonable; these are ones that I might also have struggled with, since people have varying degrees of aptitude for drawing. In particular, the hand vs. foot distinction seems difficult.
Finally, the confusion matrix looks quite good on the diagonal, with around half the classes above 90%; that's great considering how horrible some drawings are! The off-diagonal elements with the largest values involve leg vs. knee and foot, which is not surprising since many people draw the whole leg when asked to draw a knee or a foot.
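To back up the diagonal claim, the per-class recall can be read directly off the diagonal of the normalised confusion matrix cm computed above:
# Per-class recall = diagonal of the row-normalised confusion matrix
for name, recall in sorted(zip(class_names, cm.diagonal()), key=lambda p: -p[1]):
    print("{:>6s}: {:.3f}".format(name, recall))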
myface.shape
(28, 28, 1)
# Visualise the 8 learned 3x3 filters of the first convolutional layer (left
# column) next to the face image transformed by each filter (right column)
plt.figure(figsize=(10,10))
for i in range(8):
    plt.subplot(8, 2, 2*i+1)
    sns.heatmap(cnn.layers[1].get_weights()[0][:,:,:,i][:,:,0], cmap='Greys', annot=True, fmt=".1f")
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.subplot(8, 2, 2*i+2)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(run_kernel(myface, cnn.layers[1].get_weights()[0][:,:,:,i]), cmap=plt.cm.binary, norm=norm(0,1))
plt.show()
In this section, I will construct and train an auto-encoder as an example of unsupervised learning. This type of model is made up of an encoder that maps an image of shape $(N,M,C)$ into a latent vector of shape $(L,)$; and a decoder which maps the latent vector into an image with the same shape as the original. The latent, or encoded, vector will typically be small in size compared to the input image (i.e. $L \ll N \times M \times C$), meaning that it is an information bottleneck. The aim of the auto-encoder is therefore to learn the most efficient encoding of a class of images (here: quickdraw dataset) that allows the decoder to reconstruct the original as well as possible under the bottleneck constraint.
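Concretely, the encoder below will compress each $28 \times 28 \times 1 = 784$-pixel image into a latent vector of $L = 49$ values, i.e. a compression factor of $784 / 49 = 16$.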
i = Input(shape=shape, name='input')
x = i
x = Conv2D(16, kernel_size=(3,3), padding='same', activation='relu')(x)
x = MaxPooling2D(pool_size=(2,2))(x)
x = Conv2D( 8, kernel_size=(3,3), padding='same', activation='relu')(x)
x = MaxPooling2D(pool_size=(2,2))(x)
x = Conv2D( 4, kernel_size=(3,3), padding='same', activation='relu')(x)
x = Conv2D( 2, kernel_size=(3,3), padding='same', activation='relu')(x)
x = Conv2D( 1, kernel_size=(3,3), padding='same', activation='relu')(x)
e = Flatten()(x)
encoder = Model(i, e, name='encoder')
encoder.summary()
Model: "encoder" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input (InputLayer) [(None, 28, 28, 1)] 0 _________________________________________________________________ conv2d_3 (Conv2D) (None, 28, 28, 16) 160 _________________________________________________________________ max_pooling2d_2 (MaxPooling2 (None, 14, 14, 16) 0 _________________________________________________________________ conv2d_4 (Conv2D) (None, 14, 14, 8) 1160 _________________________________________________________________ max_pooling2d_3 (MaxPooling2 (None, 7, 7, 8) 0 _________________________________________________________________ conv2d_5 (Conv2D) (None, 7, 7, 4) 292 _________________________________________________________________ conv2d_6 (Conv2D) (None, 7, 7, 2) 74 _________________________________________________________________ conv2d_7 (Conv2D) (None, 7, 7, 1) 19 _________________________________________________________________ flatten_1 (Flatten) (None, 49) 0 ================================================================= Total params: 1,705 Trainable params: 1,705 Non-trainable params: 0 _________________________________________________________________
encoding_shape = encoder.layers[-2].output_shape[1:]  # (7, 7, 1); inferred from the encoder rather than hard-coded
# Define convolutional decoder model
t = Input(shape=(np.prod(encoding_shape),))
x = Reshape(encoding_shape)(t)
x = Conv2D(16, kernel_size=(3,3), padding='same', activation='relu')(x)
x = UpSampling2D(size=(2,2))(x)
x = Conv2D(8, kernel_size=(3,3), padding='same', activation='relu')(x)
x = UpSampling2D(size=(2,2))(x)
x = Conv2D(4, kernel_size=(3,3), padding='same', activation='relu')(x)
x = Conv2D(2, kernel_size=(3,3), padding='same', activation='relu')(x)
o = Conv2D(1, kernel_size=(3,3), padding='same', activation='sigmoid')(x)
decoder = Model(t, o, name='decoder')
decoder.summary()
Model: "decoder" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_2 (InputLayer) [(None, 49)] 0 _________________________________________________________________ reshape (Reshape) (None, 7, 7, 1) 0 _________________________________________________________________ conv2d_8 (Conv2D) (None, 7, 7, 16) 160 _________________________________________________________________ up_sampling2d (UpSampling2D) (None, 14, 14, 16) 0 _________________________________________________________________ conv2d_9 (Conv2D) (None, 14, 14, 8) 1160 _________________________________________________________________ up_sampling2d_1 (UpSampling2 (None, 28, 28, 8) 0 _________________________________________________________________ conv2d_10 (Conv2D) (None, 28, 28, 4) 292 _________________________________________________________________ conv2d_11 (Conv2D) (None, 28, 28, 2) 74 _________________________________________________________________ conv2d_12 (Conv2D) (None, 28, 28, 1) 19 ================================================================= Total params: 1,705 Trainable params: 1,705 Non-trainable params: 0 _________________________________________________________________
i = encoder.input
cae = Model(i, decoder(encoder(i)), name='ConvAE')
cae.summary()
Model: "ConvAE" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input (InputLayer) [(None, 28, 28, 1)] 0 _________________________________________________________________ encoder (Functional) (None, 49) 1705 _________________________________________________________________ decoder (Functional) (None, 28, 28, 1) 1705 ================================================================= Total params: 3,410 Trainable params: 3,410 Non-trainable params: 0 _________________________________________________________________
cae.compile('adam', loss='binary_crossentropy',metrics=['accuracy'])
history_cae = cae.fit(x=X_train, y=X_train, epochs=10, validation_split=0.2, batch_size=32)
Train on 128000 samples, validate on 32000 samples
Epoch  1/10 - 29s - loss: 0.1895 - accuracy: 0.7728 - val_loss: 0.1567 - val_accuracy: 0.7810
Epoch  2/10 - 21s - loss: 0.1526 - accuracy: 0.7815 - val_loss: 0.1497 - val_accuracy: 0.7820
Epoch  3/10 - 27s - loss: 0.1487 - accuracy: 0.7820 - val_loss: 0.1483 - val_accuracy: 0.7823
Epoch  4/10 - 28s - loss: 0.1472 - accuracy: 0.7823 - val_loss: 0.1464 - val_accuracy: 0.7823
Epoch  5/10 - 28s - loss: 0.1463 - accuracy: 0.7824 - val_loss: 0.1459 - val_accuracy: 0.7824
Epoch  6/10 - 21s - loss: 0.1457 - accuracy: 0.7825 - val_loss: 0.1453 - val_accuracy: 0.7825
Epoch  7/10 - 28s - loss: 0.1452 - accuracy: 0.7826 - val_loss: 0.1450 - val_accuracy: 0.7827
Epoch  8/10 - 30s - loss: 0.1448 - accuracy: 0.7826 - val_loss: 0.1446 - val_accuracy: 0.7826
Epoch  9/10 - 18s - loss: 0.1445 - accuracy: 0.7827 - val_loss: 0.1449 - val_accuracy: 0.7824
Epoch 10/10 - 22s - loss: 0.1443 - accuracy: 0.7827 - val_loss: 0.1441 - val_accuracy: 0.7826
plt.plot(history_cae.history['loss'])
plt.plot(history_cae.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
p_test = cae.predict(X_test)
plt.figure(figsize=(10,10))
plt.suptitle('Decoded')
for i in range(10):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(p_test[i], cmap='Greys')
plt.show()
plt.figure(figsize=(10,10))
plt.suptitle('Target')
for i in range(10):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X_test[i], cmap='Greys')
plt.show()
Next, I map 10 test images into the latent space using the encoder only and visualise their encodings.
encoded_imgs = encoder.predict(X_test)
n = 10
plt.figure(figsize=(20, 8))
for i in range(n):
    ax = plt.subplot(1, n, i+1)
    # Reshape each 49-dim latent vector back to its 7x7 spatial layout
    plt.imshow(encoded_imgs[i].reshape((7, 7)).T)
    plt.gray()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
plt.show()
We have introduced auto-encoders as a form of unsupervised learning, since we are not using the image labels during training. This means that auto-encoders are not ideal for image classification (at least not when we actually have the labels), but they can be used for something else: anomaly detection. This is the task of identifying examples that the model considers "anomalous" with respect to the dataset used during training.
First, I will load in some "anomalous" data:
from sklearn.datasets import fetch_olivetti_faces
faces = fetch_olivetti_faces(shuffle=True)['images']
# Downsample the 64x64 faces to 28x28 and add a trailing channel axis
faces = faces[:, 4:-4:2, 4:-4:2, np.newaxis]
print("Number of images:", faces.shape[0])
print("Shape:", faces.shape[1:])
print("Pixel intensity range: [{}, {}]".format(faces.min(), faces.max()))
plt.figure(figsize=(10,10))
for i in range(4):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(faces[i])
plt.show()
# Run the face images through the auto-encoder trained on quickdraw sketches
p_face = cae.predict(faces)
plt.figure(figsize=(10,10))
for i in range(4):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(p_face[i])
plt.show()
Number of images: 400 Shape: (28, 28, 1) Pixel intensity range: [0.020661156624555588, 0.9710744023323059]
Imagine now that we had a dataset composed mostly of quickdraw images, but also containing a small subset of anomalies or "outliers"; here, in the form of grayscale face images with the same shape as the quickdraw images.
# Combine the quickdraw test images and the "outliers" into a mixed dataset
mixed = np.vstack((X_test, faces))
# Shuffle the mixed dataset so the "outliers" are randomly distributed
indices = np.random.permutation(mixed.shape[0])
mixed = mixed[indices]
p_mixed = cae.predict(mixed)
# Clip the predictions away from 0 and 1 to avoid log(0) in the cross-entropy below
eps = np.finfo(float).eps
print(np.clip(p_mixed, eps, 1. - eps).min())
print(p_mixed.min())
2.220446e-16 6.821751e-38
The auto-encoder was trained to minimise the difference between the original and the auto-encoded image, so let's use binary cross-entropy (BCE) as our metric for the difference between an image and its auto-encoded version. The binary_crossentropy function provided below computes pixel-wise BCE for two (arrays of) images: the input and the output image.
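For a single pixel with input intensity $x \in [0, 1]$ and (clipped) reconstruction $\hat{x}$, this is
$$\mathrm{BCE}(x, \hat{x}) = -\left[ x \log \hat{x} + (1 - x) \log (1 - \hat{x}) \right],$$
which the function evaluates element-wise over the image arrays.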
def binary_crossentropy(img_in, img_out):
    assert img_in.shape == img_out.shape
    # Clip the reconstruction away from 0 and 1 so the logarithms stay finite
    eps = np.finfo(float).eps
    img_out = np.clip(img_out, eps, 1. - eps)
    return - (img_in * np.log(img_out) + (1 - img_in) * np.log(1 - img_out))
# For reference, equivalent metrics exist off the shelf in Keras and sklearn
m = metrics.BinaryCrossentropy()
from sklearn.metrics import log_loss
log_loss
<function sklearn.metrics._classification.log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None)>
def difference(img_in, img_out):
    # Mean BCE per image, averaged over height, width, and channel
    return binary_crossentropy(img_in, img_out).mean(axis=(1,2,3))
binary_crossentropy(mixed, p_mixed).shape
(40400, 28, 28, 1)
binary_crossentropy(mixed, p_mixed).mean(axis=(3))
array([[[2.32458387e-06, 4.76837272e-07, 0.00000000e+00, ...]]])
(output truncated: one 28x28 map of per-pixel BCE values per image, near zero for most pixels)
score = difference(mixed, p_mixed)
# Histogram of per-image difference scores, on log-spaced bins
bins = np.logspace(-2, 2, 200+1)
plt.hist(score, bins=bins, alpha=0.5);
plt.xscale('log')
plt.xlabel('Difference score');
# Sort images by difference score (ascending: well-reconstructed first)
order = np.argsort(score)
best_indices = order[:9]    # lowest scores: most efficient encodings
worst_indices = order[-9:]  # highest scores: least efficient encodings
plt.figure(figsize=(10,10))
plt.suptitle("Most efficient encodings")
for i in range(len(best_indices)):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(mixed[best_indices[i]])
plt.show()
plt.figure(figsize=(10,10))
plt.suptitle("Least efficient encodings")
for i in range(len(worst_indices)):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(mixed[worst_indices[i]])
plt.show()
From the histogram of difference scores we clearly see two peaks: the most populous one with very low (i.e. good) difference scores, and one with very large (i.e. bad) difference scores. The latter are considered the outliers, and the former the inliers. When we plot the most inlier-like and the most outlier-like images, it is clear that simple line drawings are very easy for the auto-encoder to encode efficiently, whereas (as we suspected) images of faces are consistently poorly reconstructed. This means that if someone shuffled a few faces into your neat pile of quickdraw sketches, you would be able to pick them out automatically with very high confidence!
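As a sketch of how this selection could be automated: the threshold below (1.0) is an assumption read off the histogram, roughly between the two peaks, and the ground truth uses the fact that the faces occupied the last rows of the stack before shuffling.
# Flag images whose difference score exceeds an assumed threshold between the peaks
threshold = 1.0
flagged = score > threshold
# Ground truth: positions whose pre-shuffle index points past X_test are faces
is_face = indices >= X_test.shape[0]
print("Flagged as outliers:", flagged.sum())
print("Faces among flagged: {} of {} faces".format((flagged & is_face).sum(), is_face.sum()))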
# Decode random latent vectors to generate new images
z_random = np.random.rand(16, np.prod(encoding_shape))
gen = decoder.predict(z_random)
plt.figure(figsize=(10,10))
for i, img in enumerate(gen):
    plt.subplot(4, 4, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(img)
plt.show()
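Uniform noise in $[0, 1)$ need not match the distribution of real encodings (the ReLU latents are sparse and unbounded above), so the generated images may look off-distribution. A possible refinement, sketched below under the assumption that per-dimension Gaussian statistics of the encoded test images are a reasonable latent model:
# Sample latents from per-dimension Gaussian fits to the real encodings,
# clipping at zero to respect the ReLU non-negativity of the latent space
z_mean = encoded_imgs.mean(axis=0)
z_std = encoded_imgs.std(axis=0)
z_sampled = np.clip(np.random.randn(16, np.prod(encoding_shape)) * z_std + z_mean, 0., None)
gen = decoder.predict(z_sampled)  # plot with the same loop as above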