Naive Bayes is a probabilistic generative model in which we define the probability for and against one class (a binary classifier). We will build 10 such models, one for each of the 10 digit classes, and then predict by taking the class with the maximum probability for a given digit image.
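In equation form, the prediction for an image x is the class whose summed log-likelihood over all 784 pixels is largest (the class prior is dropped here since, as assumed below, it is equal for every digit):
\hat{c} = \arg\max_{c \in \{0, \dots, 9\}} \sum_{j=1}^{784} \log p(x_j \mid c)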
Approach for this method:
Importing packages and data
from scipy.stats import mode
import numpy as np
#from mnist import MNIST
from time import time
import pandas as pd
import os
import matplotlib.pyplot as matplot
import matplotlib
%matplotlib inline
import random
matplot.rcdefaults()
from IPython.display import display, HTML
from itertools import chain
from sklearn.metrics import confusion_matrix
import seaborn as sb
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/')
train = mnist.train.images
validation = mnist.validation.images
test = mnist.test.images
trlab = mnist.train.labels
vallab = mnist.validation.labels
tslab = mnist.test.labels
# Fold the validation split into the training set
train = np.concatenate((train, validation), axis=0)
trlab = np.concatenate((trlab, vallab), axis=0)
# Rescale pixel values from [0, 1] back to [0, 255]
train = train * 255
test = test * 255
Here we see that the data is normalized to the 0-1 range, so multiplying by 255 (the 8-bit color range) gives pixel values with 0 and 255 as min and max.
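As a quick sanity check (a small optional sketch, not part of the original pipeline), the pixel range after rescaling can be verified:
print("train pixel range:", train.min(), train.max())  # expected roughly 0.0 and 255.0
print("test pixel range:", test.min(), test.max())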
We assume that the probability model for each pixel is Gaussian and that each class (i.e., each digit) is equally likely, that is, P(c = 0) = P(c = 1) = ... = P(c = 9). Let x = (x_1, x_2, ..., x_784) be the vector of pixel values for a given image and c the class, i.e., the digit 0 to 9. Since the prior P(c) is identical for every class, it can be dropped when comparing the classes.
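Spelled out, the per-pixel class-conditional likelihood computed in the function below is a Gaussian with a small amount of variance smoothing (the symbol \epsilon here stands for the `smoothing` argument added to the variance in the code):
p(x_j \mid c) = \frac{1}{\sqrt{2\pi(\sigma_{c,j}^{2} + \epsilon)}} \exp\left(-\frac{(x_j - \mu_{c,j})^{2}}{2(\sigma_{c,j}^{2} + \epsilon)}\right)
where \mu_{c,j} and \sigma_{c,j}^{2} are the mean and variance of pixel j over the training images of class c.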
def naivebayes(train, train_lb, test, test_lb, smoothing):
    n_class = np.unique(train_lb)
    st = time()
    # Estimate per-class pixel means, standard deviations, priors and counts
    m, s, prior, count = [], [], [], []
    for i, val in enumerate(n_class):
        sep = train_lb == val
        count.append(len(train_lb[sep]))
        prior.append(len(train_lb[sep]) / len(train_lb))
        m.append(np.mean(train[sep], axis=0))
        s.append(np.std(train[sep], axis=0))
    pred = []
    likelihood = []
    lcs = []
    for n in range(len(test_lb)):
        classifier = []
        sample = test[n]  # test sample
        ll = []
        for i, val in enumerate(n_class):
            m1 = m[i]
            var = np.square(s[i]) + smoothing  # smoothed variance avoids division by zero
            # Gaussian likelihood of every pixel under class i
            prob = 1 / np.sqrt(2 * np.pi * var) * np.exp(-np.square(sample - m1) / (2 * var))
            result = np.sum(np.log(prob))  # log-likelihood of the whole image
            classifier.append(result)
            ll.append(prob)
        pred.append(np.argmax(classifier))  # predict the class with the highest log-likelihood
        likelihood.append(ll)
        lcs.append(classifier)
    return pred, likelihood
All the per-pixel probabilities computed in Naive Bayes are very small, almost negligible. So, to avoid the product of many tiny values underflowing to zero, we take the log of each probability and sum them instead.
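To illustrate why the log-sum is needed (a minimal sketch, separate from the classifier): a product of 784 tiny per-pixel probabilities underflows double precision to zero, while the sum of their logs remains a usable finite score.
probs = np.full(784, 1e-5)       # 784 equally tiny per-pixel probabilities
print(np.prod(probs))            # 0.0 -- the raw product underflows
print(np.sum(np.log(probs)))     # about -9026, a finite log-likelihood score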
def error_rate(confusion_matrix):
    a = confusion_matrix
    b = a.sum(axis=1)
    df = []
    for i in range(0, 10):
        temp = 1 - a[i][i] / b[i]
        df.append(temp)
    df = pd.DataFrame(df)
    df.columns = ['% Error rate']
    return df * 100
nb = naivebayes(train=train, train_lb=trlab, test=test, test_lb=tslab, smoothing=1000)
nb_pred = nb[0]
cm = confusion_matrix(tslab, nb_pred)  # rows: actual values, columns: predicted values
print("Test Accuracy:", round((sum(np.diagonal(cm)) / len(nb_pred)) * 100, 4), '%')
matplot.subplots(figsize=(10, 6))
sb.heatmap(cm, annot = True, fmt = 'g')
matplot.xlabel("Predicted")
matplot.ylabel("Actual")
matplot.title("Confusion Matrix")
matplot.show()
From the confusion matrix we can see the common misclassifications: for example, 4 is frequently misclassified as 9, although 5 is the digit misclassified most often overall.
error_rate(cm)
likeli = nb[1]
likli = likeli[9999]  # per-class pixel likelihoods for the last test image
matplot.subplots(2, 5, figsize=(24, 10))
for i in range(10):
    l1 = matplot.subplot(2, 5, i + 1)
    l1.imshow(likli[i].reshape(28, 28), interpolation='nearest', cmap=matplot.cm.RdBu)
    l1.set_xticks(())
    l1.set_yticks(())
    l1.set_xlabel('Class %i' % i)
matplot.suptitle('Conditional probability images for each class (according to likelihood): correctly classified as 6')
matplot.show()
likli = likeli[9998]  # per-class pixel likelihoods for the second-to-last test image
matplot.subplots(2, 5, figsize=(24, 10))
for i in range(10):
    l1 = matplot.subplot(2, 5, i + 1)
    l1.imshow(likli[i].reshape(28, 28), interpolation='nearest', cmap=matplot.cm.RdBu)
    l1.set_xticks(())
    l1.set_yticks(())
    l1.set_xlabel('Class %i' % i)
matplot.suptitle('Conditional probability images for each class (according to likelihood): wrongly classified as 8 when the true digit is 5')
matplot.show()
In each panel we can see the class template with the test digit overlaid on top of it, which is how the per-pixel probabilities arise; the class images are blurred because each pixel is modeled with a Gaussian. Intuitively, Naive Bayes lays the test digit over each class template and checks how well they match.
Naive Bayes gives noticeably lower accuracy than kNN (from the last assignment). The reason is that it treats all pixel probabilities as independent of one another (hence "naive"). The probability images above also show why it is difficult to differentiate between some classes using the Naive Bayes likelihood alone.