

Training and Testing

1 Working With MNIST

Basics of Machine Learning

In [4]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.models as models
import tensorflow.keras.layers as layers
import tensorflow.keras.losses as losses
import tensorflow.keras.optimizers as optimizers
import tensorflow.keras.datasets as datasets
In [5]:
import numpy as np
import matplotlib.pyplot as pl

Dataset

MNIST is a collection of 70,000 images of handwritten single digits, derived from a larger set of scans collected by NIST.

Each image is 28x28 pixels, and all 70,000 images are labeled with the digit they depict.

In [34]:
data = tf.keras.datasets.mnist.load_data(path='/data/shared/datasets/mnist.npz')

#
# By default, the data is already split into training and testing data
#
train_data, test_data = data

#
# Data consists of one tensor for the images, and another tensor for the label.
#
x_train, y_train = train_data
x_test, y_test = test_data

#
# We also rescale the pixel values from 0-255 to the range 0-1
#
x_train = x_train / 255
x_test = x_test / 255

print('x_train', x_train.shape)
print('y_train', y_train.shape)
x_train (60000, 28, 28)
y_train (60000,)
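
The test split contains 10,000 images of the same size, which we can check the same way:

In [ ]:
print('x_test', x_test.shape)
print('y_test', y_test.shape)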

We can inspect the dataset by plotting some of the images and their labels.

In [62]:
def show_images(k, x, y=None):
    # Plot a k-by-k grid of images, with their labels as titles if given.
    pl.figure(figsize=(10, 10))
    for i in range(k*k):
        pl.subplot(k, k, i+1)
        pl.imshow(x[i], cmap='gray')
        pl.axis('off')
        if y is not None:
            pl.title("y=%s" % y[i])
    pl.tight_layout(pad=0.3)

show_images(3, x_train, y_train)
[figure: 3x3 grid of training images with their labels]

We can also use NumPy boolean indexing to handpick images with a particular label.

In [63]:
I = y_train == 5

show_images(3, x_train[I])
[figure: 3x3 grid of images labeled 5]

Model

Let's train a multi-class classifier with the following architecture:

  • Flatten each 28x28 image into a 784-dimensional vector.
  • A dense layer maps the 784 features to 10 outputs, one per digit class.
  • Softmax rescales the 10 outputs into a probability distribution.
  • Cross-entropy is the loss function.
In [37]:
model = models.Sequential([
    layers.Input(shape=(28, 28)),
    layers.Reshape((784,)),
    layers.Dense(10, activation='softmax'),
])
In [38]:
model.compile(
    optimizer=optimizers.Adam(0.1),
    loss=losses.SparseCategoricalCrossentropy(),
    metrics=['acc'],
)

Training

We choose a small batch size so that the training loop is memory-friendly.

Each epoch scans through the entire training set once. With 60,000 training samples and a batch size of 64, the model's weights are updated $\lceil 60000/64 \rceil = 938$ times per epoch.
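
As a quick sanity check, the number of weight updates per epoch can be computed directly, assuming x_train from the cells above:

In [ ]:
import math

batch_size = 64
steps_per_epoch = math.ceil(len(x_train) / batch_size)  # 60000 samples, batches of 64
print(steps_per_epoch)                                   # 938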

In [39]:
model.fit(x_train, y_train, batch_size=64, epochs=3)
Epoch 1/3
938/938 [==============================] - 16s 16ms/step - loss: 1.1932 - acc: 0.8334
Epoch 2/3
938/938 [==============================] - 15s 16ms/step - loss: 1.1349 - acc: 0.8844
Epoch 3/3
938/938 [==============================] - 16s 17ms/step - loss: 1.2654 - acc: 0.8858
Out[39]:
<tensorflow.python.keras.callbacks.History at 0x7fd21055e8e0>

Testing

It's very important that we test how well the model does on data it has never seen.

Since the training data was used to fit the model, the training accuracy is a biased estimate of how the model will perform in the real world.

For this reason, 10,000 samples were withheld from training. This is called the test data.

In [40]:
model.evaluate(x_test, y_test)
313/313 [==============================] - 2s 7ms/step - loss: 1.3797 - acc: 0.8885
Out[40]:
[1.3796886205673218, 0.8884999752044678]

Observation

The model achieved similar accuracy for both the training data and the test data.

This suggests that the model did not overfit to the training data.
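
To put the two accuracies side by side, we can evaluate the trained model on both splits; a minimal sketch, assuming the model and data from the cells above:

In [ ]:
# evaluate() returns [loss, acc] because we compiled with metrics=['acc'].
train_loss, train_acc = model.evaluate(x_train, y_train, verbose=0)
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print('train acc: %.4f, test acc: %.4f' % (train_acc, test_acc))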

Prediction

After training and validation, the model can be used like any function in your program.
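
For example, here is a minimal sketch of classifying a single test image, using the model and data from the cells above:

In [ ]:
# Predict class probabilities for the first test image.
probs = model.predict(x_test[:1])        # shape (1, 10)
digit = np.argmax(probs, axis=1)[0]      # most likely digit
print('predicted:', digit, 'actual:', y_test[0])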

Cross digit similarity

Let's consider the problem of finding which other digit is generally the most similar to the digit 5.

We solve this using the model by looking for the digit that is most often misidentified as 5.

In [41]:
def misclassify(x, y, source_digit, target_digit):
    # Select all samples whose true label is source_digit.
    I = y == source_digit
    z = model.predict(x[I])
    y_pred = np.argmax(z, axis=1)
    # Percentage of source_digit samples that the model classifies as target_digit.
    return np.sum(y_pred == target_digit) / y_pred.shape[0] * 100
In [42]:
misclassify(x_test, y_test, 6, 5)
Out[42]:
3.7578288100208765
In [43]:
sorted([(misclassify(x_test, y_test, i, 5), i) for i in range(10)], reverse=1)
Out[43]:
[(81.16591928251121, 5),
 (3.7578288100208765, 6),
 (2.8747433264887063, 8),
 (1.1224489795918366, 0),
 (0.891089108910891, 3),
 (0.6937561942517344, 9),
 (0.4844961240310077, 2),
 (0.20366598778004072, 4),
 (0.09727626459143969, 7),
 (0.0881057268722467, 1)]

Worst hand-written digits

We can use the model to identify the worst-written examples of a digit.

This is done by looking at the samples where the model assigns low probability to the true label.

In [74]:
def worst_digits(x, y, digit, topk):
    # Select all samples whose true label is the given digit.
    I = y == digit
    z = model.predict(x[I])
    # Sort by the probability the model assigns to the true label, ascending.
    J = np.argsort(z[:, digit])
    return x[I][J][:topk]
In [78]:
k = 3
digit = 0
x_worst = worst_digits(x_test, y_test, digit, k*k)
show_images(k, x_worst)
[figure: 3x3 grid of the worst-written 0s in the test set]