Dhairya Kothari

kNN with MNIST - Machine Learning

In [1]:
import os
import gzip
import math
import operator
import sklearn.model_selection
import random

import matplotlib.pyplot as matplot
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np
import pickle as cPickle

from time import time
from itertools import chain
from collections import Counter
from PIL import Image
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

1NN

Importing the data from the TensorFlow package and checking the shapes of the training and test sets

In [2]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/')
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
In [3]:
train = mnist.train.images
validation = mnist.validation.images
test = mnist.test.images

trlab = mnist.train.labels
vallab = mnist.validation.labels
tslab = mnist.test.labels

train = np.concatenate((train, validation), axis=0)
trlab = np.concatenate((trlab, vallab), axis=0)
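
The tensorflow.examples.tutorials input_data helper used above has been removed from recent TensorFlow releases. A minimal sketch of an equivalent load via tf.keras (an assumption about the environment, not how this notebook was originally run); Keras already returns all 60000 training images, so no train/validation concatenation is needed, but the uint8 images must be flattened and scaled to match the float layout above:

import numpy as np
import tensorflow as tf

# Keras gives uint8 images of shape (N, 28, 28) with values in [0, 255]
(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.mnist.load_data()

train = x_tr.reshape(-1, 784).astype(np.float32) / 255.0   # (60000, 784), values in [0, 1]
test  = x_te.reshape(-1, 784).astype(np.float32) / 255.0   # (10000, 784)
trlab, tslab = y_tr, y_te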
In [4]:
train.shape
Out[4]:
(60000, 784)
In [5]:
trlab.shape
Out[5]:
(60000,)
In [6]:
test.shape
Out[6]:
(10000, 784)

Example image (28 x 28 pixels)

In [7]:
x = np.reshape(train[0], [28,28])
matplot.imshow(x, cmap='Greys_r')
Out[7]:
<matplotlib.image.AxesImage at 0x19660636518>

Checking the distribution of color (greyscale) in the image

0 = Black ; 1 = White

In [22]:
matplot.subplots(figsize=(12, 8))
matplot.hist(train[0])
matplot.show()

Some example images with their corresponding labels

In [11]:
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
num_classes = len(classes)
samples = 8

matplot.subplots(figsize=(15, 10))
for y, cls in enumerate(classes):
    idxs = np.nonzero([i == y for i in trlab])
    idxs = np.random.choice(idxs[0], samples, replace=False)
    for i , idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1
        matplot.subplot(samples, num_classes, plt_idx)
        matplot.imshow(train[idx].reshape((28, 28)))
        matplot.axis("off")
        if i == 0:
            matplot.title(cls)
        

matplot.show()

We can see that the classifier may confuse 1 with 7; similarly, 9 is very close to 4, 0 to 8, and 2 to 7 in some instances. We might encounter prediction errors in these cases.

KNN function

In [12]:
def knn(train, train_label, test, test_label, k):
    pred = []
    for w in range(len(test)):
        test_1 = test[w]
        diff = (train - test_1)
        dist = np.einsum('ij, ij->i', diff, diff)  # squared Euclidean distance to every training image
        nearest_lbs = train_label[np.argsort(dist)[:k]]  # labels of the k nearest training images
        major = mode(nearest_lbs)[0][0]  # majority vote
        pred.append(major)

    cm = pd.DataFrame(confusion_matrix(test_label, pred))
    err = 1-accuracy_score(test_label, pred)
    return [err,cm,pred]
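
The einsum line above computes the squared Euclidean distance from one test image to all training images at once. The same idea extends to the whole test set via ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b; a sketch of that variant (an optional speed-up I am assuming, not the version timed below; for the full 60000 x 10000 problem the distance matrix needs several GB, so the test set would have to be processed in chunks):

def knn_batch(train, train_label, test, k):
    # squared distances between every test image and every training image
    d2 = (np.sum(test**2, axis=1)[:, None]
          + np.sum(train**2, axis=1)[None, :]
          - 2.0 * test.dot(train.T))
    nearest = np.argsort(d2, axis=1)[:, :k]                 # k nearest training indices per test image
    return mode(train_label[nearest], axis=1)[0].ravel()    # majority label per test image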
In [11]:
st = time()
Q1 = knn(train,trlab,test,tslab,1)
t = time() - st

Results of 1NN

In [28]:
print(Q1[0]) #Error Rate
0.0309
In [13]:
print(t) #time taken (in seconds) for classifying 10000 test images
print(t/10000) #Query time (in seconds) for 1 image classification
870.1476261615753
0.08701476261615754
In [30]:
print(1-Q1[0]) #Accuracy
0.9691
In [15]:
print(Q1[1]) #Confusion Matrix: Original Label VS Predicted Value
     0     1    2    3    4    5    6    7    8    9
0  973     1    1    0    0    1    3    1    0    0
1    0  1129    3    0    1    1    1    0    0    0
2    7     6  992    5    1    0    2   16    3    0
3    0     1    2  970    1   19    0    7    7    3
4    0     7    0    0  944    0    3    5    1   22
5    1     1    0   12    2  860    5    1    6    4
6    4     2    0    0    3    5  944    0    0    0
7    0    14    6    2    4    0    0  992    0   10
8    6     1    3   14    5   13    3    4  920    5
9    2     5    1    6   10    5    1   11    1  967

Error rate for individual digits

0 - 0.007173
1 - 0.005286
2 - 0.038760
3 - 0.039604
4 - 0.038697
5 - 0.035874
6 - 0.014614
7 - 0.035019
8 - 0.055441
9 - 0.041625
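
These per-digit rates follow directly from the confusion matrix: 1 minus the diagonal entry divided by the row total. A minimal sketch of that computation on Q1[1] (the same idea as the error_rate() helper defined in a later section):

cm = Q1[1]                                   # confusion matrix (rows = true label)
per_digit_err = 1 - np.diag(cm) / cm.sum(axis=1)
print(per_digit_err)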

- After implementing 1NN we get a very good error rate of 3.09 %
- Error is particularly high for 2, 3, 4, 5, 7, 8 and 9; the highest among them is 8
- There is confusion of 7 with 1; of 8 and 5 with 3; of 9 with 4; and of 2 with 7

KNN with Leave-One-Out Cross Validation

In [14]:
def knncv(data, label, klist):
    # df[p, i] = 1 if point p is misclassified when left out and classified with k = klist[i]
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    for p in range(len(label)):
        te = data[p]
        te_lb = label[p]
        tr = np.delete(data, p, 0)         # training set without the left-out point
        train_label = np.delete(label, p)

        diff = (tr - te)
        dis = np.einsum('ij, ij->i', diff, diff)  # squared Euclidean distances
        for i, k in enumerate(klist):
            near = train_label[np.argsort(dis)[:k]]
            pick = mode(near)[0][0]
            if pick == te_lb:
                df.iloc[p][i] = 0
            else:
                df.iloc[p][i] = 1

    return df
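
Since the distances for a left-out point do not depend on k, the argsort only needs to be done once per point; the error flags for every k can then be read off the same ordered label array. A sketch of that variant (an efficiency tweak I am assuming, giving the same results as knncv() above):

def knncv_fast(data, label, klist):
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    kmax = max(klist)
    for p in range(len(label)):
        diff = np.delete(data, p, 0) - data[p]
        dis = np.einsum('ij, ij->i', diff, diff)
        ordered = np.delete(label, p)[np.argsort(dis)[:kmax]]  # labels sorted by distance, once
        for i, k in enumerate(klist):
            df.iloc[p, i] = int(mode(ordered[:k])[0][0] != label[p])
    return df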

LOOCV kNN for k values 1 to 20

In [56]:
Q2 = knncv(train, trlab, range(1,21))
# print(np.mean(Q2, axis = 0)) # Error rate for K values (1 to 20)
In [57]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(Q2, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value')
matplot.show()

K = 3 seems to give the best result

In [58]:
st = time()
Q2a = knn(train,trlab,test,tslab,3)
t = time()-st
print(t/10000) #Query time (in seconds) for 1 image classification for 3NN
0.08965984258651734

Results for 3NN

In [59]:
print(Q2a[0]*100) #Error Rate
2.95
In [60]:
print(Q2a[1]) #Confusion Matrix
     0     1    2    3    4    5    6    7    8    9
0  974     1    1    0    0    1    2    1    0    0
1    0  1133    2    0    0    0    0    0    0    0
2   10     9  996    2    0    0    0   13    2    0
3    0     2    4  976    1   13    1    7    3    3
4    1     6    0    0  950    0    4    2    0   19
5    6     1    0   11    2  859    5    1    3    4
6    5     3    0    0    3    3  944    0    0    0
7    0    21    5    0    1    0    0  991    0   10
8    8     2    4   16    8   11    3    4  914    4
9    4     5    2    8    9    2    1    8    2  968

Error rate for individual digits

0 - 0.006122
1 - 0.001762
2 - 0.034884
3 - 0.033663
4 - 0.032587
5 - 0.036996
6 - 0.014614
7 - 0.035992
8 - 0.061602
9 - 0.040634

- We improved a bit
- After implementing 3NN we get a better error rate of 2.95 %
- Error decreased slightly for most of the harder digits (2, 3, 4, 9); the highest error is still for 8, where it actually increased
- There is still some confusion of 7 with 1; of 8 and 5 with 3; of 9 with 4; and of 2 with 7

Importing data

In [1]:
import os
import gzip
import math
import operator
import sklearn.model_selection
import random

import matplotlib.pyplot as matplot
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np
import pickle as cPickle

from time import time
from itertools import chain
from collections import Counter
from PIL import Image
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
In [2]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/')
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
In [3]:
train = mnist.train.images
validation = mnist.validation.images
test = mnist.test.images

trlab = mnist.train.labels
vallab = mnist.validation.labels
tslab = mnist.test.labels

train = np.concatenate((train, validation), axis=0)
trlab = np.concatenate((trlab, vallab), axis=0)

KNN function

In [4]:
def knn(train, train_label, test, test_label, k):
    pred = []
    for w in range(len(test)):
        test_1 = test[w]
        diff = (train - test_1)
        dist = np.einsum('ij, ij->i', diff, diff)  #distance measure
        nearest_lbs = train_label[np.argsort(dist)[:k]]
        major = mode(nearest_lbs)[0][0]
        pred.append(major)

    cm = pd.DataFrame(confusion_matrix(test_label, pred))
    err = 1-accuracy_score(test_label, pred)
    return [err,cm,pred]

Defining a per-digit error-rate helper for the kNN results, displayed in tabulated format

In [5]:
def error_rate(confusion_matrix):
    a = confusion_matrix
    b = a.sum(axis=1)
    df = []
    for i in range(0,10):
        temp = 1-a[i][i]/b[i]
        df.append(temp)
    
    df = pd.DataFrame(df)
    df.columns = ['Error rate']
    return df

KNN with Leave-One-Out Cross Validation

In [6]:
def knncv(data, label, klist):
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    for p in range(len(label)):
        te = data[p]
        te_lb = label[p]
        tr = np.delete(data, p, 0)
        train_label = np.delete(label, p)

        diff = (tr - te)
        dis = np.einsum('ij, ij->i', diff, diff) 
        for i, k in enumerate(klist):
            near = train_label[np.argsort(dis)[:k]]
            pick = mode(near)[0][0]
            if pick == te_lb:
                df.iloc[p][i] = 0
            else:
                df.iloc[p][i] = 1

    return df

Generating a random sample of the data

In [7]:
#generating a random sequence for sampling
seq = np.random.randint(0,60000,6000)
train_samp = train[seq]
trlab_samp = trlab[seq]

train_samp.shape
trlab_samp.shape
Out[7]:
(6000,)
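
Note that np.random.randint samples indices with replacement, so a few images can appear more than once in the 6000-point sample. If distinct rows are wanted, np.random.choice with replace=False does the same job (a small variation, not what was run above):

seq = np.random.choice(60000, 6000, replace=False)   # 6000 distinct indices
train_samp = train[seq]
trlab_samp = trlab[seq]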

Comparing sampled data distribution with the original

In [8]:
fig, ax = matplot.subplots(1,2, figsize=(15,7))
ax[0].hist(trlab_samp)
ax[1].hist(trlab)
fig.show
matplot.show()

Downsampling

Defining the downsampling function

In [9]:
def downsamples(n, data):
    # keep every n-th pixel of the flattened 784-vector
    rn = range(0,784,n)
    data = data[:, rn]
    return data
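
Because range(0, 784, n) strides over the flattened vector, for these values of n it keeps every n-th column but every row of the 28 x 28 image. A sketch of a true 2-D subsample that keeps every n-th row and column instead (an alternative, not what is used below; note it yields (28/n)^2 features, so the dimensions would differ from those reported):

def downsamples_2d(n, data):
    # keep every n-th row and every n-th column of each 28 x 28 image
    imgs = data.reshape(-1, 28, 28)[:, ::n, ::n]
    return imgs.reshape(len(data), -1)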

N = 4 downsampling

In [35]:
q3tr = downsamples(4, train_samp)
q3tr_full = downsamples(4, train)
q3ts = downsamples(4, test)
In [35]:
q3n4 = knncv(q3tr, trlab_samp, range(1,21))
In [37]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n4, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=4)')
matplot.show()

Best k: k=1

In [36]:
st = time()
q3a = knn(q3tr_full, trlab, q3ts, tslab, 1)
end = time() - st
t3a = end/len(tslab) # Query time
In [37]:
print(q3a[0]) #Error rate
0.074
In [38]:
q3a[1] #Confusion matrix
Out[38]:
0 1 2 3 4 5 6 7 8 9
0 961 5 0 0 0 4 7 1 1 1
1 1 1117 2 0 7 0 5 3 0 0
2 15 12 953 13 5 1 3 15 14 1
3 3 9 7 915 1 25 3 11 25 11
4 0 23 4 1 894 1 6 4 2 47
5 7 8 1 30 5 788 16 4 19 14
6 5 9 0 0 4 13 926 0 1 0
7 1 28 8 0 14 0 1 955 1 20
8 9 14 7 43 12 22 2 12 835 18
9 4 13 2 13 31 5 1 21 3 916
In [39]:
error_rate(q3a[1]) #Individual digit errors
Out[39]:
Error rate
0 0.019388
1 0.015859
2 0.076550
3 0.094059
4 0.089613
5 0.116592
6 0.033403
7 0.071012
8 0.142710
9 0.092170

N = 7 downsampling

In [10]:
q3btr = downsamples(7, train_samp)
q3btr_full = downsamples(7, train)
q3bts = downsamples(7, test)
In [11]:
q3n7 = knncv(q3btr, trlab_samp, range(1,21))
In [12]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n7, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=7)')
matplot.show()

Best k: k = 5

In [13]:
st = time()
q3b = knn(q3btr_full, trlab, q3bts, tslab, 5)
end = time() - st
t3b=end/len(tslab) # Query time
In [14]:
print(q3b[0]) #Error rate
0.1535
In [15]:
q3b[1] #Confusion Matrix
Out[15]:
0 1 2 3 4 5 6 7 8 9
0 956 1 1 5 1 3 9 3 0 1
1 1 1118 3 2 2 2 2 2 0 3
2 28 12 823 32 16 11 45 24 38 3
3 16 3 16 804 1 39 23 9 84 15
4 1 42 7 1 824 8 17 18 9 55
5 21 5 9 75 13 655 29 23 42 20
6 28 16 18 8 19 14 843 6 3 3
7 4 19 14 1 15 4 6 912 10 43
8 7 17 14 126 17 52 13 19 679 30
9 9 15 4 7 31 18 2 53 19 851
In [17]:
error_rate(q3b[1]) #Individual digit errors
Out[17]:
Error rate
0 0.024490
1 0.014978
2 0.202519
3 0.203960
4 0.160896
5 0.265695
6 0.120042
7 0.112840
8 0.302875
9 0.156591

N = 14 downsampling

In [18]:
q3ctr = downsamples(14, train_samp)
q3ctr_full = downsamples(14, train)
q3cts = downsamples(14, test)
In [19]:
q3n14 = knncv(q3ctr, trlab_samp, range(1,21))
In [20]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n14, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=14)')
matplot.show()

best k: k = 9

In [21]:
st = time()
q3c = knn(q3ctr_full, trlab, q3cts, tslab, 9)
end = time() - st
t3c = end/len(tslab) # Query time
In [22]:
print(q3c[0]) #Error rate
0.2616
In [23]:
q3c[1] #Confusion Matrix
Out[23]:
0 1 2 3 4 5 6 7 8 9
0 913 1 9 10 2 5 16 19 4 1
1 2 1086 8 2 17 2 5 2 9 2
2 18 26 675 56 20 33 85 34 56 29
3 36 1 31 706 2 84 25 4 111 10
4 3 67 35 1 733 11 27 34 13 58
5 46 9 16 116 17 486 63 64 38 37
6 50 13 95 20 44 60 650 15 7 4
7 20 16 41 3 15 18 8 840 18 49
8 4 22 48 201 14 46 17 22 546 54
9 14 11 21 17 28 45 6 72 46 749
In [24]:
error_rate(q3c[1]) #Individual digit errors
Out[24]:
Error rate
0 0.068367
1 0.043172
2 0.345930
3 0.300990
4 0.253564
5 0.455157
6 0.321503
7 0.182879
8 0.439425
9 0.257681

N = 2 downsampling

In [25]:
q3dtr = downsamples(2, train_samp)
q3dtr_full = downsamples(2, train)
q3dts = downsamples(2, test)
In [26]:
q3n2 = knncv(q3dtr, trlab_samp, range(1,21))
In [27]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n2, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=2)')
matplot.show()

best k: k = 1

In [28]:
st = time()
q3d = knn(q3dtr_full, trlab, q3dts, tslab, 1)
end = time() - st
t3d = end/len(tslab) # Query time
In [29]:
print(q3d[0]) #Error rate
0.0359
In [30]:
q3d[1] #Confusion Matrix
Out[30]:
0 1 2 3 4 5 6 7 8 9
0 973 1 1 0 0 1 3 1 0 0
1 0 1132 2 1 0 0 0 0 0 0
2 7 5 991 6 1 0 3 16 3 0
3 0 2 2 959 1 26 0 9 6 5
4 1 5 0 0 936 0 3 3 2 32
5 1 1 0 15 2 853 7 2 5 6
6 6 4 0 0 4 5 938 0 1 0
7 0 18 7 2 4 0 0 988 0 9
8 4 2 4 19 6 13 3 4 909 10
9 1 4 1 7 13 4 1 12 4 962
In [31]:
error_rate(q3d[1])
Out[31]:
Error rate
0 0.007143
1 0.002643
2 0.039729
3 0.050495
4 0.046843
5 0.043722
6 0.020877
7 0.038911
8 0.066735
9 0.046581
In [40]:
n = [2,4,7,14]
err = [q3d[0],q3a[0],q3b[0],q3c[0]]
t = [t3d,t3a,t3b,t3c]

matplot.subplots(figsize=(20, 10))
matplot.plot(n, err, label="Error") # Blue
matplot.plot(n, t, label="Query time in s") # Orange
matplot.xticks(n,n)
matplot.grid(True)
matplot.legend()
matplot.title('N vs Error (Blue) & N vs Query time in s (Orange)')
matplot.show()
In [49]:
n = [2,4,7,14]
k_val = [1,1,5,9]

matplot.subplots(figsize=(10, 5))
matplot.plot(n, k_val, label="K")
matplot.xticks(n,n)
matplot.legend()
matplot.grid(True)
matplot.title('N vs K-value')
matplot.show()

  • The query time decreases as the downsampling factor N increases
  • As N increases, K increases as well
  • But also, as N increases, the error rate increases
  • Between N = 2 and N = 4 we have a sweet spot balancing K, query time and error rate

Smart Downsampling

In [41]:
def downsample2(n, data):
    # Sum each non-overlapping n x n block of the 28 x 28 image,
    # producing a (28/n) x (28/n) image per sample
    res = []
    for i in range(len(data)):
        temp = []
        sample = np.reshape(data[i], (28,28))
        row = np.asarray(np.split(sample, int(28 / n), axis=0))      # horizontal bands of height n
        for j in range(int(28 / n)):
            col = np.asarray(np.split(row[j], int(28 / n), axis=1))  # n x n blocks within the band
            add = np.asarray([np.sum(x) for x in col])               # sum of each block
            temp.append(np.ndarray.tolist(add))

        res.append(list(chain.from_iterable(temp)))

    ndim = int(28 / n) * int(28 / n)
    data = np.asarray([np.reshape(y, (ndim)) for y in res])
    return data
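
For n that divides 28 evenly (as in all the cases below), the same block sums can be computed without Python loops by reshaping each image into a (28/n) x (28/n) grid of n x n blocks and summing over the block axes; a vectorized sketch equivalent to downsample2():

def downsample2_fast(n, data):
    m = 28 // n
    blocks = data.reshape(-1, m, n, m, n)            # m x m grid of n x n blocks per image
    return blocks.sum(axis=(2, 4)).reshape(len(data), m * m)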

N = 4 smart sampling

In [42]:
q4atr = downsample2(4, train_samp)
q4atr_full = downsample2(4, train)
q4ats = downsample2(4, test)

Smartsampled image

In [43]:
x1 = np.reshape(q4atr_full[0], [7,7])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [44]:
q4n4 = knncv(q4atr, trlab_samp, range(1,21))
In [45]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n4, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=4)')
matplot.show()

best k: k = 1

In [46]:
st = time()
q4a = knn(q4atr_full, trlab, q4ats, tslab, 1)
end = time() - st
t4a = end/len(tslab) # Query time
In [50]:
print(q4a[0]) # Error rate
0.0541
In [51]:
q4a[1] # Confusion matrix
Out[51]:
0 1 2 3 4 5 6 7 8 9
0 970 1 0 0 1 2 3 1 1 1
1 0 1127 0 0 0 0 3 1 3 1
2 1 2 982 5 2 0 4 21 13 2
3 1 1 8 914 0 31 0 11 31 13
4 1 0 0 0 906 0 13 5 4 53
5 4 1 2 33 1 819 12 4 9 7
6 7 3 2 0 5 8 931 0 1 1
7 0 8 12 4 5 0 0 965 3 31
8 6 3 4 13 7 14 3 8 910 6
9 3 7 0 11 23 3 1 19 7 935
In [52]:
error_rate(q4a[1])
Out[52]:
Error rate
0 0.010204
1 0.007048
2 0.048450
3 0.095050
4 0.077393
5 0.081839
6 0.028184
7 0.061284
8 0.065708
9 0.073340

N = 7 smart sampling

In [53]:
q4btr = downsample2(7, train_samp)
q4btr_full = downsample2(7, train)
q4bts = downsample2(7, test)
In [54]:
x1 = np.reshape(q4btr_full[0], [4,4])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [55]:
q4n7 = knncv(q4btr, trlab_samp, range(1,21))
In [56]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n7, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=7)')
matplot.show()

best k: k = 7

In [57]:
st = time()
q4b = knn(q4btr_full, trlab, q4bts, tslab, 7)
end = time() - st
t4b = end/len(tslab) # Query time
In [58]:
print(q4b[0]) # Error rate
0.1744
In [59]:
q4b[1] # Confusion matrix
Out[59]:
0 1 2 3 4 5 6 7 8 9
0 798 3 6 5 8 16 15 1 124 4
1 5 1098 1 4 3 1 5 3 10 5
2 23 7 876 46 11 8 25 7 26 3
3 11 15 32 828 2 26 2 34 40 20
4 6 7 11 0 755 17 16 13 2 155
5 19 4 7 39 15 707 17 18 52 14
6 12 10 5 2 14 6 899 0 10 0
7 1 29 10 12 35 7 1 846 2 85
8 140 24 5 23 8 45 8 12 686 23
9 12 16 5 14 108 8 3 69 11 763
In [60]:
error_rate(q4b[1])
Out[60]:
Error rate
0 0.185714
1 0.032599
2 0.151163
3 0.180198
4 0.231161
5 0.207399
6 0.061587
7 0.177043
8 0.295688
9 0.243806

N = 14 smart sampling

In [61]:
q4ctr = downsample2(14, train_samp)
q4ctr_full = downsample2(14, train)
q4cts = downsample2(14, test)

x1 = np.reshape(q4ctr_full[0], [2,2])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [62]:
q4n14 = knncv(q4ctr, trlab_samp, range(1,21))
In [63]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n14, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=14)')
matplot.show()

best k: k = 20

In [64]:
st = time()
q4c = knn(q4ctr_full, trlab, q4cts, tslab, 20)
end = time() - st
t4c = end/len(tslab) # Query time
In [65]:
print(q4c[0]) # Error rate
0.493
In [66]:
q4c[1] # Confusion matrix
Out[66]:
0 1 2 3 4 5 6 7 8 9
0 698 3 26 33 24 54 20 13 102 7
1 4 999 4 31 22 21 3 8 12 31
2 128 21 489 57 80 27 153 5 37 35
3 84 32 46 416 55 107 21 70 55 124
4 47 25 95 46 410 42 92 38 32 155
5 136 31 21 113 69 281 29 84 88 40
6 58 10 189 11 96 23 542 0 12 17
7 20 49 15 79 29 39 9 665 52 71
8 324 18 13 143 32 109 18 37 231 49
9 61 20 16 151 108 37 17 189 71 339
In [67]:
error_rate(q4c[1])
Out[67]:
Error rate
0 0.287755
1 0.119824
2 0.526163
3 0.588119
4 0.582485
5 0.684978
6 0.434238
7 0.353113
8 0.762834
9 0.664024

N = 2 smart sampling

In [68]:
q4dtr = downsample2(2, train_samp)
q4dtr_full = downsample2(2, train)
q4dts = downsample2(2, test)

x1 = np.reshape(q4dtr_full[0], [14,14])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [69]:
q4n2 = knncv(q4dtr, trlab_samp, range(1,21))
In [70]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n2, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=2)')
matplot.show()

best k: k = 1

In [71]:
st = time()
q4d = knn(q4dtr_full, trlab, q4dts, tslab, 1)
end = time() - st
t4d = end/len(tslab) # Query time
In [72]:
print(q4d[0]) # Error rate
0.0283
In [73]:
q4d[1] # Confusion matrix
Out[73]:
0 1 2 3 4 5 6 7 8 9
0 971 1 2 0 0 2 2 1 1 0
1 0 1131 2 0 1 1 0 0 0 0
2 7 1 1003 2 2 0 2 10 5 0
3 1 0 3 966 1 19 0 5 9 6
4 0 1 0 0 949 0 3 3 2 24
5 1 1 0 8 1 865 5 1 7 3
6 5 2 0 1 2 4 944 0 0 0
7 0 12 6 2 6 0 0 990 1 11
8 2 1 4 13 4 10 4 4 929 3
9 1 5 1 6 9 5 1 10 2 969
In [74]:
error_rate(q4d[1])
Out[74]:
Error rate
0 0.009184
1 0.003524
2 0.028101
3 0.043564
4 0.033605
5 0.030269
6 0.014614
7 0.036965
8 0.046201
9 0.039643
In [75]:
n = [2,4,7,14]
err = [q4d[0],q4a[0],q4b[0],q4c[0]]
t = [t4d,t4a,t4b,t4c]

matplot.subplots(figsize=(20, 10))
matplot.plot(n, err, label="Error") # Blue
matplot.plot(n, t, label="Query time in s") # Orange
matplot.xticks(n,n)
matplot.grid(True)
matplot.legend()
matplot.title('N vs Error (Blue) & N vs Query time in s (Orange)')
matplot.show()
In [77]:
n = [2,4,7,14]
k_val = [1,1,7,20]

matplot.subplots(figsize=(10, 5))
matplot.plot(n, k_val, label="K")
matplot.xticks(n,n)
matplot.yticks(k_val, k_val)
matplot.legend()
matplot.grid(True)
matplot.title('N vs K-value')
matplot.show()

  • The query time decreases as the smartsampling factor N increases
  • But, as N increases, K increases as well
  • Also, as N increases, the error rate increases
  • Between N = 2 and N = 4 we have a sweet spot balancing K, query time and error rate
  • Note that k = 1 for both N = 2 and N = 4

N=28 smart sampling

In [78]:
q5tr = downsample2(28, train_samp)
q5tr_full = downsample2(28, train)
q5ts = downsample2(28, test)

x1 = np.reshape(q5tr_full[0], [1,1])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [79]:
q5k = knncv(q5tr, trlab_samp, range(1,21))
In [80]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q5k, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=28)')
matplot.show()

k = 1 gives the best accuracy

In [81]:
st = time()
q5 = knn(q5tr_full, trlab, q5ts, tslab, 1)
end = time() - st
print(end/len(tslab)) # Query time
0.004072939896583557
In [82]:
print(q5[0]) # Error rate
0.8324
In [83]:
q5[1] # Confusion matrix
Out[83]:
0 1 2 3 4 5 6 7 8 9
0 224 9 130 110 58 92 100 60 123 74
1 15 533 36 57 114 85 72 114 27 82
2 131 38 131 123 93 100 101 113 121 81
3 106 50 128 121 102 91 102 106 115 89
4 64 81 99 92 128 89 105 130 81 113
5 73 52 101 109 96 85 93 98 90 95
6 123 48 108 114 98 85 85 101 100 96
7 65 110 81 107 123 106 97 140 86 113
8 148 17 115 124 89 75 119 77 120 90
9 69 60 99 109 103 101 140 114 105 109
In [84]:
error_rate(q5[1])
Out[84]:
Error rate
0 0.771429
1 0.530396
2 0.873062
3 0.880198
4 0.869654
5 0.904709
6 0.911273
7 0.863813
8 0.876797
9 0.891972

  • When we effectively reduce each image to a single pixel, accuracy takes a heavy toll, even though the query time decreases significantly

Out of all the different downsampling methods, I would use smart sampling with N = 2 and K = 1

  • It has the best blend of query time and accuracy (error rate of 2.83 %)

Importing data

In [3]:
import os
import gzip
import math
import operator
import sklearn.model_selection
import random

import matplotlib.pyplot as matplot
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np
import pickle as cPickle

from time import time
from itertools import chain
from collections import Counter
from PIL import Image
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
In [4]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/')
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
In [5]:
train = mnist.train.images
validation = mnist.validation.images
test = mnist.test.images

trlab = mnist.train.labels
vallab = mnist.validation.labels
tslab = mnist.test.labels

train = np.concatenate((train, validation), axis=0)
trlab = np.concatenate((trlab, vallab), axis=0)

KNN function

In [6]:
def knn(train, train_label, test, test_label, k):
    pred = []
    for w in range(len(test)):
        test_1 = test[w]
        diff = (train - test_1)
        dist = np.einsum('ij, ij->i', diff, diff)  #distance measure
        nearest_lbs = train_label[np.argsort(dist)[:k]]
        major = mode(nearest_lbs)[0][0]
        pred.append(major)

    cm = pd.DataFrame(confusion_matrix(test_label, pred))
    err = 1-accuracy_score(test_label, pred)
    return [err,cm,pred]

Defining a per-digit error-rate helper for the kNN results, displayed in tabulated format

In [7]:
def error_rate(confusion_matrix):
    a = confusion_matrix
    b = a.sum(axis=1)
    df = []
    for i in range(0,10):
        temp = 1-a[i][i]/b[i]
        df.append(temp)
    
    df = pd.DataFrame(df)
    df.columns = ['Error rate']
    return df

KNN with Leave-One-Out Cross Validation

In [8]:
def knncv(data, label, klist):
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    for p in range(len(label)):
        te = data[p]
        te_lb = label[p]
        tr = np.delete(data, p, 0)
        train_label = np.delete(label, p)

        diff = (tr - te)
        dis = np.einsum('ij, ij->i', diff, diff) 
        for i, k in enumerate(klist):
            near = train_label[np.argsort(dis)[:k]]
            pick = mode(near)[0][0]
            if pick == te_lb:
                df.iloc[p][i] = 0
            else:
                df.iloc[p][i] = 1

    return df

Generating a random sample of the data

In [9]:
#generating a random sequence for sampling
seq = np.random.randint(0,60000,6000)
train_samp = train[seq]
trlab_samp = trlab[seq]

train_samp.shape
trlab_samp.shape
Out[9]:
(6000,)
In [17]:
seq = np.random.randint(0,10000,1000)
ts_samp = test[seq]
tslab_samp = tslab[seq]

Comparing sampled data distribution with the original

In [10]:
fig, ax = matplot.subplots(1,2, figsize=(10,5))
ax[0].hist(trlab_samp)
ax[1].hist(trlab)
fig.show
matplot.show()

Comparing test samples as well

In [57]:
fig, ax = matplot.subplots(1,2, figsize=(8,3))
ax[0].hist(tslab_samp)
ax[1].hist(tslab)
fig.show
matplot.show()

Convert greyscale to black and white

We see that all the images are greyscale, so we can apply a threshold and convert the individual pixel values to 0 and 1, which effectively renders them black and white. Let's try it on one image first and look at the distribution.

In [11]:
temp = train[:,]>0.75
matplot.hist(temp[0])
matplot.show()

Now that the transformation works, we apply it to the whole training and test sets. One thing to note is that the comparison actually returns a boolean array (True/False) rather than the original float values, which becomes relevant later.

Black and white function

In [12]:
def bnw(threshold, data):
    # boolean mask: True where the pixel value exceeds the threshold
    newdata = data[:,]>threshold
    return newdata
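
Since the comparison returns booleans, the arrays fed into knn() are no longer float-valued, which can interfere with the subtraction-and-einsum distance. A sketch of the same thresholding with an explicit cast back to float (an assumed fix, not what was run in the cells below):

def bnw_float(threshold, data):
    # 1.0 where the pixel exceeds the threshold, 0.0 otherwise
    return (data > threshold).astype(np.float32)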

case (a): threshold = 0.75

In [20]:
q6tr_full = bnw(0.75, train)
q6ts = bnw(0.75, ts_samp)
q6tr = bnw(0.75, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [14]:
q6k = knncv(q6tr, trlab_samp, range(1,21))
print(np.mean(q6k, axis = 0))
0     0.813833
1     0.855167
2     0.866167
3     0.868667
4     0.882667
5     0.882833
6     0.886667
7     0.882500
8     0.880000
9     0.883000
10    0.887333
11    0.895333
12    0.896500
13    0.893833
14    0.890667
15    0.893167
16    0.896500
17    0.899333
18    0.900000
19    0.900000
dtype: float64
In [15]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6k, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.75)')
matplot.show()

best k: k = 1

In [21]:
st = time()
q6a = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6a = end/len(tslab_samp) # Query time
In [22]:
print(q6a[0]) #Error rate
0.916
In [23]:
q6a[1] #Confusion matrix
Out[23]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [24]:
error_rate(q6a[1])
Out[24]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (b): threshold = 0.40

In [25]:
q6tr_full = bnw(0.40, train)
q6ts = bnw(0.40, ts_samp)
q6tr = bnw(0.40, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [28]:
q6b1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6b1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.40)')
matplot.show()

best k: k=1

In [29]:
st = time()
q6b = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6b = end/len(tslab_samp) # Query time
In [30]:
print(q6b[0]) #Error rate
0.916
In [31]:
q6b[1] #Confusion matrix
Out[31]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [32]:
error_rate(q6b[1])
Out[32]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (c): threshold = 0.50

In [33]:
q6tr_full = bnw(0.50, train)
q6ts = bnw(0.50, ts_samp)
q6tr = bnw(0.50, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [34]:
q6c1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6c1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.50)')
matplot.show()

best k: k=1

In [35]:
st = time()
q6c = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6c = end/len(tslab_samp) # Query time
In [36]:
print(q6c[0]) #Error rate
0.916
In [37]:
q6c[1] #Confusion matrix
Out[37]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [38]:
error_rate(q6c[1])
Out[38]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (d): threshold = 0.60

In [39]:
q6tr_full = bnw(0.60, train)
q6ts = bnw(0.60, ts_samp)
q6tr = bnw(0.60, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [40]:
q6d1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6d1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.60)')
matplot.show()

best k: k=1

In [41]:
st = time()
q6d = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6d = end/len(tslab_samp) # Query time
In [42]:
print(q6d[0]) #Error rate
0.916
In [43]:
q6d[1] #Confusion matrix
Out[43]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [44]:
error_rate(q6d[1])
Out[44]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (e): threshold = 0.90

In [45]:
q6tr_full = bnw(0.90, train)
q6ts = bnw(0.90, ts_samp)
q6tr = bnw(0.90, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [46]:
q6e1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6e1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.90)')
matplot.show()

best k: k=1

In [47]:
st = time()
q6e = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6e = end/len(tslab_samp) # Query time
In [48]:
print(q6e[0]) #Error rate
0.914
In [49]:
q6e[1] #Confusion matrix
Out[49]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 2 0 0 0 114 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [50]:
error_rate(q6e[1])
Out[50]:
Error rate
0 1.000000
1 0.982759
2 1.000000
3 1.000000
4 1.000000
5 0.000000
6 1.000000
7 1.000000
8 1.000000
9 1.000000
In [52]:
thres = [0.40,0.50,0.60,0.75,0.90]
err = [q6b[0],q6c[0],q6d[0],q6a[0],q6e[0]]
t = [t6b,t6c,t6d,t6a,t6e]

matplot.subplots(figsize=(20, 10))
matplot.plot(thres, err, label="Error") # Blue
matplot.plot(thres, t, label="Query time in s") # Orange
matplot.xticks(thres,thres)
matplot.grid(True)
matplot.legend()
matplot.title('Threshold vs Error (Blue) & Threshold vs Query time in s (Orange)')
matplot.show()

  • Very weird results
  • Everything is classified as 5
  • No matter the threshold, the error and query time (with this method) stay essentially constant
  • Query time is no better than the previous downsampling techniques
  • Also note that k = 1 for all the thresholds
  • A likely culprit is that bnw() returns boolean arrays; boolean subtraction and einsum do not give a meaningful squared distance, so the nearest-neighbour ordering collapses and ties are broken by training-set order, which would explain every test image getting the same label. Casting the thresholded pixels back to float, as sketched after the bnw() definition, should restore a meaningful distance
  • Strictly recommend not using this technique as-is, and developing further (better) downsampling techniques instead