from sklearn.datasets import fetch_openml  # fetch_mldata has been removed from scikit-learn, so we fetch MNIST from OpenML instead
from sklearn.metrics import classification_report, confusion_matrix  # only using sklearn to fetch the data and summarize model performance
import numpy as np

# import the data via sklearn's fetch_openml method
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]

# scale the inputs: dividing by 255 puts every pixel value between 0 and 1, which keeps the features on a common scale and makes the network easier to train

X = X / 255

# one-hot encode the labels: each digit becomes a length-10 vector with a 1 at the index of that digit, a form the model can learn from more directly
num_digits = 10
examples = y.shape[0]
y = y.reshape(1, examples)
Y_new = np.eye(num_digits)[y.astype('int32')]
Y_new = Y_new.T.reshape(num_digits, examples)
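# A quick illustration (not part of the training pipeline): np.eye(10) is the 10x10 identity matrix,
# so indexing into it turns a digit label into its one-hot vector. For example, the label 3 becomes:
print(np.eye(10)[3])   # [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]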
# split, reshape, shuffle data
m = 60000
m_test = X.shape[0] - m
X_train, X_test = X[:m].T, X[m:].T
Y_train, Y_test = Y_new[:,:m], Y_new[:,m:]
shuffle_index = np.random.permutation(m)
X_train, Y_train = X_train[:, shuffle_index], Y_train[:, shuffle_index]
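# Optional sanity check (the expected shapes assume the standard 70,000-example MNIST set with m = 60000):
# examples are stored column-wise, one example per column.
print(X_train.shape, Y_train.shape)   # expected: (784, 60000) (10, 60000)
print(X_test.shape, Y_test.shape)     # expected: (784, 10000) (10, 10000)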
'''
Defining the essential functions: the sigmoid function squashes a pre-activation value into the range (0, 1);
the loss function (cross-entropy) measures how well our model's predicted probabilities match the true labels;
the feed-forward pass moves information in one direction through the network, from the inputs to the output layer;
and the backpropagation algorithm computes the gradient of the loss (via partial derivatives) with respect to the
weights and biases, where each weight represents the strength of the connection between a node in one layer and a
node in the next.
'''

def sigmoid(z):
    s = 1. / (1. + np.exp(-z))
    return s
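# Quick spot check (illustrative only): sigmoid maps 0 to 0.5 and squashes large negative/positive
# inputs toward 0 and 1 respectively.
print(sigmoid(np.array([-10., 0., 10.])))   # ~[4.54e-05 0.5 0.99995]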

def compute_loss(Y, Y_hat):

    L_sum = np.sum(np.multiply(Y, np.log(Y_hat)))
    m = Y.shape[1]
    L = -(1./m) * L_sum

    return L
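# Worked example (illustrative only; Y_demo and Y_hat_demo are made-up toy arrays): if the true class of a
# single example receives probability 0.7, the cross-entropy loss for that example is -log(0.7) ≈ 0.357.
Y_demo = np.array([[0.], [0.], [1.]])          # one-hot label for a 3-class toy problem
Y_hat_demo = np.array([[0.2], [0.1], [0.7]])   # predicted probabilities for the same example
print(compute_loss(Y_demo, Y_hat_demo))        # ~0.3567 = -log(0.7)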

def feed_forward(X, params):

    cache = {}

    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]
    cache["A1"] = sigmoid(cache["Z1"])
    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]
    cache["A2"] = np.exp(cache["Z2"]) / np.sum(np.exp(cache["Z2"]), axis=0)

    return cache
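# Optional check (not part of the original pipeline; rng_params is a throwaway set of random weights made up
# for this demo): the softmax output A2 should give every example a proper probability distribution,
# i.e. every column sums to 1.
rng_params = { "W1": np.random.randn(64, 784) * 0.01, "b1": np.zeros((64, 1)),
               "W2": np.random.randn(10, 64) * 0.01,  "b2": np.zeros((10, 1)) }
demo_cache = feed_forward(X_train[:, :5], rng_params)
print(np.sum(demo_cache["A2"], axis=0))   # ~[1. 1. 1. 1. 1.]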

def back_propagate(X, Y, params, cache):

    dZ2 = cache["A2"] - Y                                      # gradient w.r.t. output pre-activation (softmax + cross-entropy)
    dW2 = (1./m_batch) * np.matmul(dZ2, cache["A1"].T)
    db2 = (1./m_batch) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.matmul(params["W2"].T, dZ2)                       # propagate the error back through W2
    dZ1 = dA1 * cache["A1"] * (1 - cache["A1"])                # sigmoid derivative, reusing the cached A1 = sigmoid(Z1)
    dW1 = (1./m_batch) * np.matmul(dZ1, X.T)
    db1 = (1./m_batch) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return grads
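# Optional numerical gradient check (a verification sketch only; every name below -- numerical_grad_W2,
# X_chk, Y_chk, params_chk -- is made up for this demo and none of it is needed for training). The analytic
# dW2 from back_propagate should agree with a finite-difference estimate on a tiny random problem.
def numerical_grad_W2(X_chk, Y_chk, params_chk, eps=1e-6):
    grad = np.zeros_like(params_chk["W2"])
    for r in range(grad.shape[0]):
        for c in range(grad.shape[1]):
            params_chk["W2"][r, c] += eps
            loss_plus = compute_loss(Y_chk, feed_forward(X_chk, params_chk)["A2"])
            params_chk["W2"][r, c] -= 2 * eps
            loss_minus = compute_loss(Y_chk, feed_forward(X_chk, params_chk)["A2"])
            params_chk["W2"][r, c] += eps                      # restore the original weight
            grad[r, c] = (loss_plus - loss_minus) / (2 * eps)
    return grad

m_batch = 5                                                    # back_propagate reads m_batch from the enclosing scope
X_chk = np.random.randn(4, m_batch)                            # 4 input features, 5 examples
Y_chk = np.eye(3)[np.random.randint(3, size=m_batch)].T        # random one-hot labels for a 3-class toy problem
params_chk = { "W1": np.random.randn(6, 4) * 0.1, "b1": np.zeros((6, 1)),
               "W2": np.random.randn(3, 6) * 0.1, "b2": np.zeros((3, 1)) }
cache_chk = feed_forward(X_chk, params_chk)
grads_chk = back_propagate(X_chk, Y_chk, params_chk, cache_chk)
print(np.max(np.abs(grads_chk["dW2"] - numerical_grad_W2(X_chk, Y_chk, params_chk))))   # should be ~1e-8 or smaller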
np.random.seed(138) # setting a random seed 

# hyperparameters: settings of the network and the training procedure that we choose by hand and can fine-tune
n_x = X_train.shape[0]           # number of input features (784 pixels)
n_h = 64                         # number of hidden units
learning_rate = 4
beta = .9                        # momentum coefficient
batch_size = 128
batches = -(-m // batch_size)    # ceiling division: number of mini-batches per epoch

# initialize the parameters of the neural network: weights are drawn at random and scaled by 1/sqrt(fan-in), biases start at zero. This dictionary holds the values that training will update.
params = { "W1": np.random.randn(n_h, n_x) * np.sqrt(1. / n_x),
           "b1": np.zeros((n_h, 1)),
           "W2": np.random.randn(num_digits, n_h) * np.sqrt(1. / n_h),
           "b2": np.zeros((num_digits, 1)) }

# momentum "velocity" terms: exponentially weighted moving averages of the gradients, initialized to zero
V_dW1 = np.zeros(params["W1"].shape)
V_db1 = np.zeros(params["b1"].shape)
V_dW2 = np.zeros(params["W2"].shape)
V_db2 = np.zeros(params["b2"].shape)

# training loop: each epoch reshuffles the training set, sweeps over it in mini-batches, then reports the cost on the training and test data
for i in range(9):

    permutation = np.random.permutation(X_train.shape[1])
    X_train_shuffled = X_train[:, permutation]
    Y_train_shuffled = Y_train[:, permutation]

    for j in range(batches):

        begin = j * batch_size
        end = min(begin + batch_size, X_train.shape[1])   # don't run past the last example
        X = X_train_shuffled[:, begin:end]
        Y = Y_train_shuffled[:, begin:end]
        m_batch = end - begin

        cache = feed_forward(X, params)
        grads = back_propagate(X, Y, params, cache)

        # momentum: keep an exponentially weighted moving average of the gradients
        V_dW1 = (beta * V_dW1 + (1. - beta) * grads["dW1"])
        V_db1 = (beta * V_db1 + (1. - beta) * grads["db1"])
        V_dW2 = (beta * V_dW2 + (1. - beta) * grads["dW2"])
        V_db2 = (beta * V_db2 + (1. - beta) * grads["db2"])

        # gradient descent step: move the parameters against the smoothed gradient (mini-batch gradient descent with momentum)
        params["W1"] = params["W1"] - learning_rate * V_dW1
        params["b1"] = params["b1"] - learning_rate * V_db1
        params["W2"] = params["W2"] - learning_rate * V_dW2
        params["b2"] = params["b2"] - learning_rate * V_db2

    cache = feed_forward(X_train, params)
    train_cost = compute_loss(Y_train, cache["A2"])
    cache = feed_forward(X_test, params)
    test_cost = compute_loss(Y_test, cache["A2"])
    print("Epoch {}: training cost = {}, test cost = {}".format(i+1 ,train_cost, test_cost))

print("MNIST neural network has been trained.")
Epoch 1: training cost = 0.1835608887547429, test cost = 0.180293014797307
Epoch 2: training cost = 0.09614957570025505, test cost = 0.11067043852582711
Epoch 3: training cost = 0.07519028099736778, test cost = 0.10100170478353242
Epoch 4: training cost = 0.06570659158469966, test cost = 0.09407454356348766
Epoch 5: training cost = 0.05239622125944085, test cost = 0.08976026787416064
Epoch 6: training cost = 0.04648580408051046, test cost = 0.08955359643394464
Epoch 7: training cost = 0.03753157421301977, test cost = 0.08497719402539833
Epoch 8: training cost = 0.031930247908759674, test cost = 0.08517314576771155
Epoch 9: training cost = 0.026021189255454744, test cost = 0.08127909756595492
MNIST neural network has been trained.
cache = feed_forward(X_test, params)
predictions = np.argmax(cache["A2"], axis=0)  # pick the most probable digit for each test example
labels = np.argmax(Y_test, axis=0)

print(classification_report(labels, predictions))  # summary of model performance (true labels first, then predictions)
             precision    recall  f1-score   support

          0       0.98      0.98      0.98       984
          1       0.99      0.99      0.99      1130
          2       0.97      0.98      0.97      1024
          3       0.98      0.96      0.97      1035
          4       0.97      0.98      0.97       978
          5       0.96      0.98      0.97       873
          6       0.97      0.98      0.98       953
          7       0.98      0.98      0.98      1029
          8       0.97      0.96      0.97       978
          9       0.97      0.96      0.97      1016

avg / total       0.97      0.97      0.97     10000
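Since confusion_matrix was imported above but never used, one optional follow-up (a sketch, not part of the original walkthrough) is to print it as well and see exactly which digits get mistaken for which:

print(confusion_matrix(labels, predictions))   # rows are true digits, columns are predicted digits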