Dhairya Kothari

kNN with MNIST - Machine Learning

In [1]:
import os
import gzip
import math
import operator
import sklearn.model_selection
import random

import matplotlib.pyplot as matplot
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np
import pickle as cPickle

from time import time
from itertools import chain
from collections import Counter
from PIL import Image
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

1NN

Importing the data from the TensorFlow package and checking the shapes of the training and test sets

In [2]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/')
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
In [3]:
train = mnist.train.images
validation = mnist.validation.images
test = mnist.test.images

trlab = mnist.train.labels
vallab = mnist.validation.labels
tslab = mnist.test.labels

train = np.concatenate((train, validation), axis=0)
trlab = np.concatenate((trlab, vallab), axis=0)
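
The tensorflow.examples.tutorials input_data helper used above has been removed from recent TensorFlow releases. A minimal sketch of an equivalent load via tf.keras (an assumption about the environment, not how this notebook was originally run); Keras already returns all 60000 training images, so no train/validation concatenation is needed, but the uint8 images must be flattened and scaled to match the float layout above:

import numpy as np
import tensorflow as tf

# Keras gives uint8 images of shape (N, 28, 28) with values in [0, 255]
(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.mnist.load_data()

train = x_tr.reshape(-1, 784).astype(np.float32) / 255.0   # (60000, 784), values in [0, 1]
test  = x_te.reshape(-1, 784).astype(np.float32) / 255.0   # (10000, 784)
trlab, tslab = y_tr, y_te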
In [4]:
train.shape
Out[4]:
(60000, 784)
In [5]:
trlab.shape
Out[5]:
(60000,)
In [6]:
test.shape
Out[6]:
(10000, 784)

Example image (28 x 28 pixels)

In [7]:
x = np.reshape(train[0], [28,28])
matplot.imshow(x, cmap='Greys_r')
Out[7]:
<matplotlib.image.AxesImage at 0x19660636518>

Checking the distribution of color (greyscale) in the image

0 = Black ; 1 = White

In [22]:
matplot.subplots(figsize=(12, 8))
matplot.hist(train[0])
matplot.show()

Some example images with their corresponding labels

In [11]:
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
num_classes = len(classes)
samples = 8

matplot.subplots(figsize=(15, 10))
for y, cls in enumerate(classes):
    idxs = np.nonzero([i == y for i in trlab])
    idxs = np.random.choice(idxs[0], samples, replace=False)
    for i , idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1
        matplot.subplot(samples, num_classes, plt_idx)
        matplot.imshow(train[idx].reshape((28, 28)))
        matplot.axis("off")
        if i == 0:
            matplot.title(cls)
        

matplot.show()

We can see that the classifier may confuse 1 with 7; similarly, 9 is very close to 4, 0 to 8, and 2 to 7 in some instances. We might encounter prediction errors in these cases.

KNN function

In [12]:
def knn(train, train_label, test, test_label, k):
    pred = []
    for w in range(len(test)):
        test_1 = test[w]
        diff = (train - test_1)
        dist = np.einsum('ij, ij->i', diff, diff)  # squared Euclidean distance to every training image
        nearest_lbs = train_label[np.argsort(dist)[:k]]  # labels of the k nearest training images
        major = mode(nearest_lbs)[0][0]  # majority vote
        pred.append(major)

    cm = pd.DataFrame(confusion_matrix(test_label, pred))
    err = 1-accuracy_score(test_label, pred)
    return [err,cm,pred]
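
The einsum line above computes the squared Euclidean distance from one test image to all training images at once. The same idea extends to the whole test set via ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b; a sketch of that variant (an optional speed-up I am assuming, not the version timed below; for the full 60000 x 10000 problem the distance matrix needs several GB, so the test set would have to be processed in chunks):

def knn_batch(train, train_label, test, k):
    # squared distances between every test image and every training image
    d2 = (np.sum(test**2, axis=1)[:, None]
          + np.sum(train**2, axis=1)[None, :]
          - 2.0 * test.dot(train.T))
    nearest = np.argsort(d2, axis=1)[:, :k]                 # k nearest training indices per test image
    return mode(train_label[nearest], axis=1)[0].ravel()    # majority label per test image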
In [11]:
st = time()
Q1 = knn(train,trlab,test,tslab,1)
t = time() - st

Results of 1NN

In [28]:
print(Q1[0]) #Error Rate
0.0309
In [13]:
print(t) #time taken (in seconds) for classifying 10000 test images
print(t/10000) #Query time (in seconds) for 1 image classification
870.1476261615753
0.08701476261615754
In [30]:
print(1-Q1[0]) #Accuracy
0.9691
In [15]:
print(Q1[1]) #Confusion Matrix: Original Label VS Predicted Value
     0     1    2    3    4    5    6    7    8    9
0  973     1    1    0    0    1    3    1    0    0
1    0  1129    3    0    1    1    1    0    0    0
2    7     6  992    5    1    0    2   16    3    0
3    0     1    2  970    1   19    0    7    7    3
4    0     7    0    0  944    0    3    5    1   22
5    1     1    0   12    2  860    5    1    6    4
6    4     2    0    0    3    5  944    0    0    0
7    0    14    6    2    4    0    0  992    0   10
8    6     1    3   14    5   13    3    4  920    5
9    2     5    1    6   10    5    1   11    1  967

Error rate for individual digits

0 - 0.007173
1 - 0.005286
2 - 0.038760
3 - 0.039604
4 - 0.038697
5 - 0.035874
6 - 0.014614
7 - 0.035019
8 - 0.055441
9 - 0.041625
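
These per-digit rates follow directly from the confusion matrix: 1 minus the diagonal entry divided by the row total. A minimal sketch of that computation on Q1[1] (the same idea as the error_rate() helper defined in a later section):

cm = Q1[1]                                   # confusion matrix (rows = true label)
per_digit_err = 1 - np.diag(cm) / cm.sum(axis=1)
print(per_digit_err)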

- After implementing 1NN we get a very good error rate of 3.09 %
- Error is particularly high for 2, 3, 4, 5, 7, 8 and 9; the highest among them is 8
- There is confusion of 7 with 1; of 8 and 5 with 3; of 9 with 4; and of 2 with 7

KNN with Leave-One-Out Cross Validation

In [14]:
def knncv(data, label, klist):
    # df[p, i] = 1 if point p is misclassified when left out and classified with k = klist[i]
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    for p in range(len(label)):
        te = data[p]
        te_lb = label[p]
        tr = np.delete(data, p, 0)         # training set without the left-out point
        train_label = np.delete(label, p)

        diff = (tr - te)
        dis = np.einsum('ij, ij->i', diff, diff)  # squared Euclidean distances
        for i, k in enumerate(klist):
            near = train_label[np.argsort(dis)[:k]]
            pick = mode(near)[0][0]
            if pick == te_lb:
                df.iloc[p][i] = 0
            else:
                df.iloc[p][i] = 1

    return df
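
Since the distances for a left-out point do not depend on k, the argsort only needs to be done once per point; the error flags for every k can then be read off the same ordered label array. A sketch of that variant (an efficiency tweak I am assuming, giving the same results as knncv() above):

def knncv_fast(data, label, klist):
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    kmax = max(klist)
    for p in range(len(label)):
        diff = np.delete(data, p, 0) - data[p]
        dis = np.einsum('ij, ij->i', diff, diff)
        ordered = np.delete(label, p)[np.argsort(dis)[:kmax]]  # labels sorted by distance, once
        for i, k in enumerate(klist):
            df.iloc[p, i] = int(mode(ordered[:k])[0][0] != label[p])
    return df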

LOOCV kNN for k values 1 to 20

In [56]:
Q2 = knncv(train, trlab, range(1,21))
# print(np.mean(Q2, axis = 0)) # Error rate for K values (1 to 20)
In [57]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(Q2, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value')
matplot.show()

K = 3 seems to give the best result

In [58]:
st = time()
Q2a = knn(train,trlab,test,tslab,3)
t = time()-st
print(t/10000) #Query time (in seconds) for 1 image classification for 3NN
0.08965984258651734

Results for 3NN

In [59]:
print(Q2a[0]*100) #Error Rate
2.95
In [60]:
print(Q2a[1]) #Confusion Matrix
     0     1    2    3    4    5    6    7    8    9
0  974     1    1    0    0    1    2    1    0    0
1    0  1133    2    0    0    0    0    0    0    0
2   10     9  996    2    0    0    0   13    2    0
3    0     2    4  976    1   13    1    7    3    3
4    1     6    0    0  950    0    4    2    0   19
5    6     1    0   11    2  859    5    1    3    4
6    5     3    0    0    3    3  944    0    0    0
7    0    21    5    0    1    0    0  991    0   10
8    8     2    4   16    8   11    3    4  914    4
9    4     5    2    8    9    2    1    8    2  968

Error rate for individual digits

0 - 0.006122
1 - 0.001762
2 - 0.034884
3 - 0.033663
4 - 0.032587
5 - 0.036996
6 - 0.014614
7 - 0.035992
8 - 0.061602
9 - 0.040634

- We improved a bit
- After implementing 3NN we get a better error rate of 2.95 %
- Error decreased slightly for most of the harder digits (2, 3, 4, 9); the highest error is still for 8, where it actually increased
- There is still some confusion of 7 with 1; of 8 and 5 with 3; of 9 with 4; and of 2 with 7

Importing data

In [1]:
import os
import gzip
import math
import operator
import sklearn.model_selection
import random

import matplotlib.pyplot as matplot
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np
import pickle as cPickle

from time import time
from itertools import chain
from collections import Counter
from PIL import Image
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
In [2]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/')
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
In [3]:
train = mnist.train.images
validation = mnist.validation.images
test = mnist.test.images

trlab = mnist.train.labels
vallab = mnist.validation.labels
tslab = mnist.test.labels

train = np.concatenate((train, validation), axis=0)
trlab = np.concatenate((trlab, vallab), axis=0)

KNN function

In [4]:
def knn(train, train_label, test, test_label, k):
    pred = []
    for w in range(len(test)):
        test_1 = test[w]
        diff = (train - test_1)
        dist = np.einsum('ij, ij->i', diff, diff)  #distance measure
        nearest_lbs = train_label[np.argsort(dist)[:k]]
        major = mode(nearest_lbs)[0][0]
        pred.append(major)

    cm = pd.DataFrame(confusion_matrix(test_label, pred))
    err = 1-accuracy_score(test_label, pred)
    return [err,cm,pred]

Defining a per-digit error-rate helper for the kNN results, displayed in tabulated format

In [5]:
def error_rate(confusion_matrix):
    a = confusion_matrix
    b = a.sum(axis=1)
    df = []
    for i in range(0,10):
        temp = 1-a[i][i]/b[i]
        df.append(temp)
    
    df = pd.DataFrame(df)
    df.columns = ['Error rate']
    return df

KNN with Leave-One-Out Cross Validation

In [6]:
def knncv(data, label, klist):
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    for p in range(len(label)):
        te = data[p]
        te_lb = label[p]
        tr = np.delete(data, p, 0)
        train_label = np.delete(label, p)

        diff = (tr - te)
        dis = np.einsum('ij, ij->i', diff, diff) 
        for i, k in enumerate(klist):
            near = train_label[np.argsort(dis)[:k]]
            pick = mode(near)[0][0]
            if pick == te_lb:
                df.iloc[p][i] = 0
            else:
                df.iloc[p][i] = 1

    return df

Generating a random sample of the data

In [7]:
#generating a random sequence for sampling
seq = np.random.randint(0,60000,6000)
train_samp = train[seq]
trlab_samp = trlab[seq]

train_samp.shape
trlab_samp.shape
Out[7]:
(6000,)
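
Note that np.random.randint samples indices with replacement, so a few images can appear more than once in the 6000-point sample. If distinct rows are wanted, np.random.choice with replace=False does the same job (a small variation, not what was run above):

seq = np.random.choice(60000, 6000, replace=False)   # 6000 distinct indices
train_samp = train[seq]
trlab_samp = trlab[seq]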

Comparing sampled data distribution with the original

In [8]:
fig, ax = matplot.subplots(1,2, figsize=(15,7))
ax[0].hist(trlab_samp)
ax[1].hist(trlab)
fig.show
matplot.show()

Downsampling

Defining the downsampling function

In [9]:
def downsamples(n, data):
    # keep every n-th pixel of the flattened 784-vector
    rn = range(0,784,n)
    data = data[:, rn]
    return data
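
Because range(0, 784, n) strides over the flattened vector, for these values of n it keeps every n-th column but every row of the 28 x 28 image. A sketch of a true 2-D subsample that keeps every n-th row and column instead (an alternative, not what is used below; note it yields (28/n)^2 features, so the dimensions would differ from those reported):

def downsamples_2d(n, data):
    # keep every n-th row and every n-th column of each 28 x 28 image
    imgs = data.reshape(-1, 28, 28)[:, ::n, ::n]
    return imgs.reshape(len(data), -1)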

N = 4 downsampling

In [35]:
q3tr = downsamples(4, train_samp)
q3tr_full = downsamples(4, train)
q3ts = downsamples(4, test)
In [35]:
q3n4 = knncv(q3tr, trlab_samp, range(1,21))
In [37]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n4, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=4)')
matplot.show()

Best k: k=1

In [36]:
st = time()
q3a = knn(q3tr_full, trlab, q3ts, tslab, 1)
end = time() - st
t3a = end/len(tslab) # Query time
In [37]:
print(q3a[0]) #Error rate
0.074
In [38]:
q3a[1] #Confusion matrix
Out[38]:
0 1 2 3 4 5 6 7 8 9
0 961 5 0 0 0 4 7 1 1 1
1 1 1117 2 0 7 0 5 3 0 0
2 15 12 953 13 5 1 3 15 14 1
3 3 9 7 915 1 25 3 11 25 11
4 0 23 4 1 894 1 6 4 2 47
5 7 8 1 30 5 788 16 4 19 14
6 5 9 0 0 4 13 926 0 1 0
7 1 28 8 0 14 0 1 955 1 20
8 9 14 7 43 12 22 2 12 835 18
9 4 13 2 13 31 5 1 21 3 916
In [39]:
error_rate(q3a[1]) #Individual digit errors
Out[39]:
Error rate
0 0.019388
1 0.015859
2 0.076550
3 0.094059
4 0.089613
5 0.116592
6 0.033403
7 0.071012
8 0.142710
9 0.092170

N = 7 downsampling

In [10]:
q3btr = downsamples(7, train_samp)
q3btr_full = downsamples(7, train)
q3bts = downsamples(7, test)
In [11]:
q3n7 = knncv(q3btr, trlab_samp, range(1,21))
In [12]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n7, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=7)')
matplot.show()

Best k: k = 5

In [13]:
st = time()
q3b = knn(q3btr_full, trlab, q3bts, tslab, 5)
end = time() - st
t3b=end/len(tslab) # Query time
In [14]:
print(q3b[0]) #Error rate
0.1535
In [15]:
q3b[1] #Confusion Matrix
Out[15]:
0 1 2 3 4 5 6 7 8 9
0 956 1 1 5 1 3 9 3 0 1
1 1 1118 3 2 2 2 2 2 0 3
2 28 12 823 32 16 11 45 24 38 3
3 16 3 16 804 1 39 23 9 84 15
4 1 42 7 1 824 8 17 18 9 55
5 21 5 9 75 13 655 29 23 42 20
6 28 16 18 8 19 14 843 6 3 3
7 4 19 14 1 15 4 6 912 10 43
8 7 17 14 126 17 52 13 19 679 30
9 9 15 4 7 31 18 2 53 19 851
In [17]:
error_rate(q3b[1]) #Individual digit errors
Out[17]:
Error rate
0 0.024490
1 0.014978
2 0.202519
3 0.203960
4 0.160896
5 0.265695
6 0.120042
7 0.112840
8 0.302875
9 0.156591

N = 14 downsampling

In [18]:
q3ctr = downsamples(14, train_samp)
q3ctr_full = downsamples(14, train)
q3cts = downsamples(14, test)
In [19]:
q3n14 = knncv(q3ctr, trlab_samp, range(1,21))
In [20]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n14, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=14)')
matplot.show()

best k: k = 9

In [21]:
st = time()
q3c = knn(q3ctr_full, trlab, q3cts, tslab, 9)
end = time() - st
t3c = end/len(tslab) # Query time
In [22]:
print(q3c[0]) #Error rate
0.2616
In [23]:
q3c[1] #Confusion Matrix
Out[23]:
0 1 2 3 4 5 6 7 8 9
0 913 1 9 10 2 5 16 19 4 1
1 2 1086 8 2 17 2 5 2 9 2
2 18 26 675 56 20 33 85 34 56 29
3 36 1 31 706 2 84 25 4 111 10
4 3 67 35 1 733 11 27 34 13 58
5 46 9 16 116 17 486 63 64 38 37
6 50 13 95 20 44 60 650 15 7 4
7 20 16 41 3 15 18 8 840 18 49
8 4 22 48 201 14 46 17 22 546 54
9 14 11 21 17 28 45 6 72 46 749
In [24]:
error_rate(q3c[1]) #Individual digit errors
Out[24]:
Error rate
0 0.068367
1 0.043172
2 0.345930
3 0.300990
4 0.253564
5 0.455157
6 0.321503
7 0.182879
8 0.439425
9 0.257681

N = 2 downsampling

In [25]:
q3dtr = downsamples(2, train_samp)
q3dtr_full = downsamples(2, train)
q3dts = downsamples(2, test)
In [26]:
q3n2 = knncv(q3dtr, trlab_samp, range(1,21))
In [27]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q3n2, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for downsampling n=2)')
matplot.show()

best k: k = 1

In [28]:
st = time()
q3d = knn(q3dtr_full, trlab, q3dts, tslab, 1)
end = time() - st
t3d = end/len(tslab) # Query time
In [29]:
print(q3d[0]) #Error rate
0.0359
In [30]:
q3d[1] #Confusion Matrix
Out[30]:
0 1 2 3 4 5 6 7 8 9
0 973 1 1 0 0 1 3 1 0 0
1 0 1132 2 1 0 0 0 0 0 0
2 7 5 991 6 1 0 3 16 3 0
3 0 2 2 959 1 26 0 9 6 5
4 1 5 0 0 936 0 3 3 2 32
5 1 1 0 15 2 853 7 2 5 6
6 6 4 0 0 4 5 938 0 1 0
7 0 18 7 2 4 0 0 988 0 9
8 4 2 4 19 6 13 3 4 909 10
9 1 4 1 7 13 4 1 12 4 962
In [31]:
error_rate(q3d[1])
Out[31]:
Error rate
0 0.007143
1 0.002643
2 0.039729
3 0.050495
4 0.046843
5 0.043722
6 0.020877
7 0.038911
8 0.066735
9 0.046581
In [40]:
n = [2,4,7,14]
err = [q3d[0],q3a[0],q3b[0],q3c[0]]
t = [t3d,t3a,t3b,t3c]

matplot.subplots(figsize=(20, 10))
matplot.plot(n, err, label="Error") # Blue
matplot.plot(n, t, label="Query time in s") # Orange
matplot.xticks(n,n)
matplot.grid(True)
matplot.legend()
matplot.title('N vs Error (Blue) & N vs Query time in s (Orange)')
matplot.show()
In [49]:
n = [2,4,7,14]
k_val = [1,1,5,9]

matplot.subplots(figsize=(10, 5))
matplot.plot(n, k_val, label="K")
matplot.xticks(n,n)
matplot.legend()
matplot.grid(True)
matplot.title('N vs K-value')
matplot.show()

  • The query time decreases as the downsampling factor N increases
  • As N increases, K increases as well
  • But also, as N increases, the error rate increases
  • Between N = 2 and N = 4 we have a sweet spot balancing K, query time and error rate

Smart Downsampling

In [41]:
def downsample2(n, data):
    # Sum each non-overlapping n x n block of the 28 x 28 image,
    # producing a (28/n) x (28/n) image per sample
    res = []
    for i in range(len(data)):
        temp = []
        sample = np.reshape(data[i], (28,28))
        row = np.asarray(np.split(sample, int(28 / n), axis=0))      # horizontal bands of height n
        for j in range(int(28 / n)):
            col = np.asarray(np.split(row[j], int(28 / n), axis=1))  # n x n blocks within the band
            add = np.asarray([np.sum(x) for x in col])               # sum of each block
            temp.append(np.ndarray.tolist(add))

        res.append(list(chain.from_iterable(temp)))

    ndim = int(28 / n) * int(28 / n)
    data = np.asarray([np.reshape(y, (ndim)) for y in res])
    return data
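
For n that divides 28 evenly (as in all the cases below), the same block sums can be computed without Python loops by reshaping each image into a (28/n) x (28/n) grid of n x n blocks and summing over the block axes; a vectorized sketch equivalent to downsample2():

def downsample2_fast(n, data):
    m = 28 // n
    blocks = data.reshape(-1, m, n, m, n)            # m x m grid of n x n blocks per image
    return blocks.sum(axis=(2, 4)).reshape(len(data), m * m)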

N = 4 smart sampling

In [42]:
q4atr = downsample2(4, train_samp)
q4atr_full = downsample2(4, train)
q4ats = downsample2(4, test)

Smartsampled image

In [43]:
x1 = np.reshape(q4atr_full[0], [7,7])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [44]:
q4n4 = knncv(q4atr, trlab_samp, range(1,21))
In [45]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n4, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=4)')
matplot.show()

best k: k = 1

In [46]:
st = time()
q4a = knn(q4atr_full, trlab, q4ats, tslab, 1)
end = time() - st
t4a = end/len(tslab) # Query time
In [50]:
print(q4a[0]) # Error rate
0.0541
In [51]:
q4a[1] # Confusion matrix
Out[51]:
0 1 2 3 4 5 6 7 8 9
0 970 1 0 0 1 2 3 1 1 1
1 0 1127 0 0 0 0 3 1 3 1
2 1 2 982 5 2 0 4 21 13 2
3 1 1 8 914 0 31 0 11 31 13
4 1 0 0 0 906 0 13 5 4 53
5 4 1 2 33 1 819 12 4 9 7
6 7 3 2 0 5 8 931 0 1 1
7 0 8 12 4 5 0 0 965 3 31
8 6 3 4 13 7 14 3 8 910 6
9 3 7 0 11 23 3 1 19 7 935
In [52]:
error_rate(q4a[1])
Out[52]:
Error rate
0 0.010204
1 0.007048
2 0.048450
3 0.095050
4 0.077393
5 0.081839
6 0.028184
7 0.061284
8 0.065708
9 0.073340

N = 7 smart sampling

In [53]:
q4btr = downsample2(7, train_samp)
q4btr_full = downsample2(7, train)
q4bts = downsample2(7, test)
In [54]:
x1 = np.reshape(q4btr_full[0], [4,4])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [55]:
q4n7 = knncv(q4btr, trlab_samp, range(1,21))
In [56]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n7, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=7)')
matplot.show()

best k: k = 7

In [57]:
st = time()
q4b = knn(q4btr_full, trlab, q4bts, tslab, 7)
end = time() - st
t4b = end/len(tslab) # Query time
In [58]:
print(q4b[0]) # Error rate
0.1744
In [59]:
q4b[1] # Confusion matrix
Out[59]:
0 1 2 3 4 5 6 7 8 9
0 798 3 6 5 8 16 15 1 124 4
1 5 1098 1 4 3 1 5 3 10 5
2 23 7 876 46 11 8 25 7 26 3
3 11 15 32 828 2 26 2 34 40 20
4 6 7 11 0 755 17 16 13 2 155
5 19 4 7 39 15 707 17 18 52 14
6 12 10 5 2 14 6 899 0 10 0
7 1 29 10 12 35 7 1 846 2 85
8 140 24 5 23 8 45 8 12 686 23
9 12 16 5 14 108 8 3 69 11 763
In [60]:
error_rate(q4b[1])
Out[60]:
Error rate
0 0.185714
1 0.032599
2 0.151163
3 0.180198
4 0.231161
5 0.207399
6 0.061587
7 0.177043
8 0.295688
9 0.243806

N = 14 smart sampling

In [61]:
q4ctr = downsample2(14, train_samp)
q4ctr_full = downsample2(14, train)
q4cts = downsample2(14, test)

x1 = np.reshape(q4ctr_full[0], [2,2])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [62]:
q4n14 = knncv(q4ctr, trlab_samp, range(1,21))
In [63]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n14, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=14)')
matplot.show()

best k: k = 20

In [64]:
st = time()
q4c = knn(q4ctr_full, trlab, q4cts, tslab, 20)
end = time() - st
t4c = end/len(tslab) # Query time
In [65]:
print(q4c[0]) # Error rate
0.493
In [66]:
q4c[1] # Confusion matrix
Out[66]:
0 1 2 3 4 5 6 7 8 9
0 698 3 26 33 24 54 20 13 102 7
1 4 999 4 31 22 21 3 8 12 31
2 128 21 489 57 80 27 153 5 37 35
3 84 32 46 416 55 107 21 70 55 124
4 47 25 95 46 410 42 92 38 32 155
5 136 31 21 113 69 281 29 84 88 40
6 58 10 189 11 96 23 542 0 12 17
7 20 49 15 79 29 39 9 665 52 71
8 324 18 13 143 32 109 18 37 231 49
9 61 20 16 151 108 37 17 189 71 339
In [67]:
error_rate(q4c[1])
Out[67]:
Error rate
0 0.287755
1 0.119824
2 0.526163
3 0.588119
4 0.582485
5 0.684978
6 0.434238
7 0.353113
8 0.762834
9 0.664024

N = 2 smart sampling

In [68]:
q4dtr = downsample2(2, train_samp)
q4dtr_full = downsample2(2, train)
q4dts = downsample2(2, test)

x1 = np.reshape(q4dtr_full[0], [14,14])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [69]:
q4n2 = knncv(q4dtr, trlab_samp, range(1,21))
In [70]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q4n2, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=2)')
matplot.show()

best k: k = 1

In [71]:
st = time()
q4d = knn(q4dtr_full, trlab, q4dts, tslab, 1)
end = time() - st
t4d = end/len(tslab) # Query time
In [72]:
print(q4d[0]) # Error rate
0.0283
In [73]:
q4d[1] # Confusion matrix
Out[73]:
0 1 2 3 4 5 6 7 8 9
0 971 1 2 0 0 2 2 1 1 0
1 0 1131 2 0 1 1 0 0 0 0
2 7 1 1003 2 2 0 2 10 5 0
3 1 0 3 966 1 19 0 5 9 6
4 0 1 0 0 949 0 3 3 2 24
5 1 1 0 8 1 865 5 1 7 3
6 5 2 0 1 2 4 944 0 0 0
7 0 12 6 2 6 0 0 990 1 11
8 2 1 4 13 4 10 4 4 929 3
9 1 5 1 6 9 5 1 10 2 969
In [74]:
error_rate(q4d[1])
Out[74]:
Error rate
0 0.009184
1 0.003524
2 0.028101
3 0.043564
4 0.033605
5 0.030269
6 0.014614
7 0.036965
8 0.046201
9 0.039643
In [75]:
n = [2,4,7,14]
err = [q4d[0],q4a[0],q4b[0],q4c[0]]
t = [t4d,t4a,t4b,t4c]

matplot.subplots(figsize=(20, 10))
matplot.plot(n, err, label="Error") # Blue
matplot.plot(n, t, label="Query time in s") # Orange
matplot.xticks(n,n)
matplot.grid(True)
matplot.legend()
matplot.title('N vs Error (Blue) & N vs Query time in s (Orange)')
matplot.show()
In [77]:
n = [2,4,7,14]
k_val = [1,1,7,20]

matplot.subplots(figsize=(10, 5))
matplot.plot(n, k_val, label="K")
matplot.xticks(n,n)
matplot.yticks(k_val, k_val)
matplot.legend()
matplot.grid(True)
matplot.title('N vs K-value')
matplot.show()

  • The query time decreases as the smartsampling factor N increases
  • But, as N increases, K increases as well
  • Also, as N increases, the error rate increases
  • Between N = 2 and N = 4 we have a sweet spot balancing K, query time and error rate
  • Note that k = 1 for both N = 2 and N = 4

N=28 smart sampling

In [78]:
q5tr = downsample2(28, train_samp)
q5tr_full = downsample2(28, train)
q5ts = downsample2(28, test)

x1 = np.reshape(q5tr_full[0], [1,1])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [79]:
q5k = knncv(q5tr, trlab_samp, range(1,21))
In [80]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q5k, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (for smartsampling n=28)')
matplot.show()

k = 1 gives the best accuracy

In [81]:
st = time()
q5 = knn(q5tr_full, trlab, q5ts, tslab, 1)
end = time() - st
print(end/len(tslab)) # Query time
0.004072939896583557
In [82]:
print(q5[0]) # Error rate
0.8324
In [83]:
q5[1] # Confusion matrix
Out[83]:
0 1 2 3 4 5 6 7 8 9
0 224 9 130 110 58 92 100 60 123 74
1 15 533 36 57 114 85 72 114 27 82
2 131 38 131 123 93 100 101 113 121 81
3 106 50 128 121 102 91 102 106 115 89
4 64 81 99 92 128 89 105 130 81 113
5 73 52 101 109 96 85 93 98 90 95
6 123 48 108 114 98 85 85 101 100 96
7 65 110 81 107 123 106 97 140 86 113
8 148 17 115 124 89 75 119 77 120 90
9 69 60 99 109 103 101 140 114 105 109
In [84]:
error_rate(q5[1])
Out[84]:
Error rate
0 0.771429
1 0.530396
2 0.873062
3 0.880198
4 0.869654
5 0.904709
6 0.911273
7 0.863813
8 0.876797
9 0.891972

  • When we effectively reduce each image to a single pixel, accuracy takes a heavy toll, even though the query time decreases significantly

Out of all the different downsampling methods, I would use smart sampling with N = 2 and K = 1

  • It has the best blend of query time and accuracy (error rate of 2.83 %)

Importing data

In [3]:
import os
import gzip
import math
import operator
import sklearn.model_selection
import random

import matplotlib.pyplot as matplot
import matplotlib
%matplotlib inline
import pandas as pd
import numpy as np
import pickle as cPickle

from time import time
from itertools import chain
from collections import Counter
from PIL import Image
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
In [4]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/')
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
In [5]:
train = mnist.train.images
validation = mnist.validation.images
test = mnist.test.images

trlab = mnist.train.labels
vallab = mnist.validation.labels
tslab = mnist.test.labels

train = np.concatenate((train, validation), axis=0)
trlab = np.concatenate((trlab, vallab), axis=0)

KNN function

In [6]:
def knn(train, train_label, test, test_label, k):
    pred = []
    for w in range(len(test)):
        test_1 = test[w]
        diff = (train - test_1)
        dist = np.einsum('ij, ij->i', diff, diff)  #distance measure
        nearest_lbs = train_label[np.argsort(dist)[:k]]
        major = mode(nearest_lbs)[0][0]
        pred.append(major)

    cm = pd.DataFrame(confusion_matrix(test_label, pred))
    err = 1-accuracy_score(test_label, pred)
    return [err,cm,pred]

Defining a per-digit error-rate helper for the kNN results, displayed in tabulated format

In [7]:
def error_rate(confusion_matrix):
    a = confusion_matrix
    b = a.sum(axis=1)
    df = []
    for i in range(0,10):
        temp = 1-a[i][i]/b[i]
        df.append(temp)
    
    df = pd.DataFrame(df)
    df.columns = ['Error rate']
    return df

KNN with Leave-One-Out Cross Validation

In [8]:
def knncv(data, label, klist):
    df = pd.DataFrame(index=range(len(label)), columns=range(len(klist)))
    for p in range(len(label)):
        te = data[p]
        te_lb = label[p]
        tr = np.delete(data, p, 0)
        train_label = np.delete(label, p)

        diff = (tr - te)
        dis = np.einsum('ij, ij->i', diff, diff) 
        for i, k in enumerate(klist):
            near = train_label[np.argsort(dis)[:k]]
            pick = mode(near)[0][0]
            if pick == te_lb:
                df.iloc[p][i] = 0
            else:
                df.iloc[p][i] = 1

    return df

Generating a random sample of the data

In [9]:
#generating a random sequence for sampling
seq = np.random.randint(0,60000,6000)
train_samp = train[seq]
trlab_samp = trlab[seq]

train_samp.shape
trlab_samp.shape
Out[9]:
(6000,)
In [17]:
seq = np.random.randint(0,10000,1000)
ts_samp = test[seq]
tslab_samp = tslab[seq]

Comparing sampled data distribution with the original

In [10]:
fig, ax = matplot.subplots(1,2, figsize=(10,5))
ax[0].hist(trlab_samp)
ax[1].hist(trlab)
fig.show
matplot.show()

Comparing test samples as well

In [57]:
fig, ax = matplot.subplots(1,2, figsize=(8,3))
ax[0].hist(tslab_samp)
ax[1].hist(tslab)
fig.show
matplot.show()

Convert greyscale to black and white

We see that all the images are greyscale, so we can apply a threshold and convert the individual pixel values to 0 and 1, which effectively renders them black and white. Let's try it on one image first and look at the distribution.

In [11]:
temp = train[:,]>0.75
matplot.hist(temp[0])
matplot.show()

Now that the transformation works, we apply it to the whole training and test sets. One thing to note is that the comparison actually returns a boolean array (True/False) rather than the original float values, which becomes relevant later.

Black and white function

In [12]:
def bnw(threshold, data):
    # boolean mask: True where the pixel value exceeds the threshold
    newdata = data[:,]>threshold
    return newdata
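
Since the comparison returns booleans, the arrays fed into knn() are no longer float-valued, which can interfere with the subtraction-and-einsum distance. A sketch of the same thresholding with an explicit cast back to float (an assumed fix, not what was run in the cells below):

def bnw_float(threshold, data):
    # 1.0 where the pixel exceeds the threshold, 0.0 otherwise
    return (data > threshold).astype(np.float32)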

case (a): threshold = 0.75

In [20]:
q6tr_full = bnw(0.75, train)
q6ts = bnw(0.75, ts_samp)
q6tr = bnw(0.75, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [14]:
q6k = knncv(q6tr, trlab_samp, range(1,21))
print(np.mean(q6k, axis = 0))
0     0.813833
1     0.855167
2     0.866167
3     0.868667
4     0.882667
5     0.882833
6     0.886667
7     0.882500
8     0.880000
9     0.883000
10    0.887333
11    0.895333
12    0.896500
13    0.893833
14    0.890667
15    0.893167
16    0.896500
17    0.899333
18    0.900000
19    0.900000
dtype: float64
In [15]:
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6k, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.75)')
matplot.show()

best k: k = 1

In [21]:
st = time()
q6a = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6a = end/len(tslab_samp) # Query time
In [22]:
print(q6a[0]) #Error rate
0.916
In [23]:
q6a[1] #Confusion matrix
Out[23]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [24]:
error_rate(q6a[1])
Out[24]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (b): threshold = 0.40

In [25]:
q6tr_full = bnw(0.40, train)
q6ts = bnw(0.40, ts_samp)
q6tr = bnw(0.40, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [28]:
q6b1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6b1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.40)')
matplot.show()

best k: k=1

In [29]:
st = time()
q6b = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6b = end/len(tslab_samp) # Query time
In [30]:
print(q6b[0]) #Error rate
0.916
In [31]:
q6b[1] #Confusion matrix
Out[31]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [32]:
error_rate(q6b[1])
Out[32]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (c): threshold = 0.50

In [33]:
q6tr_full = bnw(0.50, train)
q6ts = bnw(0.50, ts_samp)
q6tr = bnw(0.50, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [34]:
q6c1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6c1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.50)')
matplot.show()

best k: k=1

In [35]:
st = time()
q6c = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6c = end/len(tslab_samp) # Query time
In [36]:
print(q6c[0]) #Error rate
0.916
In [37]:
q6c[1] #Confusion matrix
Out[37]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [38]:
error_rate(q6c[1])
Out[38]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (d): threshold = 0.60

In [39]:
q6tr_full = bnw(0.60, train)
q6ts = bnw(0.60, ts_samp)
q6tr = bnw(0.60, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [40]:
q6d1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6d1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.60)')
matplot.show()

best k: k=1

In [41]:
st = time()
q6d = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6d = end/len(tslab_samp) # Query time
In [42]:
print(q6d[0]) #Error rate
0.916
In [43]:
q6d[1] #Confusion matrix
Out[43]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 0 0 0 0 116 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [44]:
error_rate(q6d[1])
Out[44]:
Error rate
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
5 0.0
6 1.0
7 1.0
8 1.0
9 1.0

case (e): threshold = 0.90

In [45]:
q6tr_full = bnw(0.90, train)
q6ts = bnw(0.90, ts_samp)
q6tr = bnw(0.90, train_samp)

x1 = np.reshape(q6tr_full[0], [28,28])
matplot.imshow(x1, cmap='Greys_r')
matplot.show()
In [46]:
q6e1 = knncv(q6tr, trlab_samp, range(1,21))
matplot.subplots(figsize=(20, 10))
matplot.plot(np.mean(q6e1, axis = 0))
matplot.xticks(range(0,20), range(1,21))
matplot.grid(True)
matplot.title('Error rate vs K value (b n w threshold = 0.90)')
matplot.show()

best k: k=1

In [47]:
st = time()
q6e = knn(q6tr, trlab_samp, q6ts, tslab_samp, 1)
end = time() - st
t6e = end/len(tslab_samp) # Query time
In [48]:
print(q6e[0]) #Error rate
0.914
In [49]:
q6e[1] #Confusion matrix
Out[49]:
0 1 2 3 4 5 6 7 8 9
0 0 0 0 0 0 87 0 0 0 0
1 0 2 0 0 0 114 0 0 0 0
2 0 0 0 0 0 100 0 0 0 0
3 0 0 0 0 0 115 0 0 0 0
4 0 0 0 0 0 99 0 0 0 0
5 0 0 0 0 0 84 0 0 0 0
6 0 0 0 0 0 101 0 0 0 0
7 0 0 0 0 0 103 0 0 0 0
8 0 0 0 0 0 96 0 0 0 0
9 0 0 0 0 0 99 0 0 0 0
In [50]:
error_rate(q6e[1])
Out[50]:
Error rate
0 1.000000
1 0.982759
2 1.000000
3 1.000000
4 1.000000
5 0.000000
6 1.000000
7 1.000000
8 1.000000
9 1.000000
In [52]:
thres = [0.40,0.50,0.60,0.75,0.90]
err = [q6b[0],q6c[0],q6d[0],q6a[0],q6e[0]]
t = [t6b,t6c,t6d,t6a,t6e]

matplot.subplots(figsize=(20, 10))
matplot.plot(thres, err, label="Error") # Blue
matplot.plot(thres, t, label="Query time in s") # Orange
matplot.xticks(thres,thres)
matplot.grid(True)
matplot.legend()
matplot.title('Threshold vs Error (Blue) & Threshold vs Query time in s (Orange)')
matplot.show()

  • Very weird results
  • Everything is classified as 5
  • No matter the threshold, the error and query time (with this method) stay essentially constant
  • Query time is no better than the previous downsampling techniques
  • Also note that k = 1 for all the thresholds
  • A likely culprit is that bnw() returns boolean arrays; boolean subtraction and einsum do not give a meaningful squared distance, so the nearest-neighbour ordering collapses and ties are broken by training-set order, which would explain every test image getting the same label. Casting the thresholded pixels back to float, as sketched after the bnw() definition, should restore a meaningful distance
  • Strictly recommend not using this technique as-is, and developing further (better) downsampling techniques instead