K-Nearest Neighbors#
Dataset Setup#
[5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
plt.style.use('seaborn')
%matplotlib inline
[6]:
n_samples = 100
X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_classes=2, class_sep=2,
                           n_clusters_per_class=1, random_state=888)
X.shape, y.shape
[6]:
((100, 2), (100,))
[7]:
fig = plt.figure(figsize=(5,5))
ax1 = fig.add_subplot(1, 1, 1)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, style=y, ax=ax1, palette='deep', s=100)
plt.show()

Assumption#
Similar points have similar labels: points that are close together under the chosen distance metric tend to share a label.
Algorithm#
\begin{align*}
& \text{Test point } x \\
& \text{$k$ nearest neighbors } S_x \subseteq D \text{, such that } |S_x| = k \\
& \forall (x', y') \in D \setminus S_x \quad \text{(in the dataset but not among the nearest neighbors):} \\
& \qquad \operatorname{dist}(x, x') \ge \max_{(x'', y'') \in S_x} \operatorname{dist}(x, x'')
\end{align*}
In essence, any point that is not among the k nearest neighbors must be at least as far from the test point as the farthest of the k nearest neighbors.
The labels of the k nearest neighbors determine the test point’s label, typically by majority vote.
KNN is only as good as the distance metric it uses.
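As a quick sanity check of this definition, the sketch below verifies the property numerically with plain numpy: for an arbitrary test point, every point outside the neighbor set is at least as far away as the farthest point inside it. The test point and k used here are illustrative choices, not values from the notebook.

# Minimal numerical check of the neighbor-set property above (illustrative test point and k)
test_point = np.array([0.0, 0.0])                    # hypothetical test point
distances = np.linalg.norm(X - test_point, axis=1)   # Euclidean distance to every training point
k = 7
neighbor_idx = np.argsort(distances)[:k]             # indices of the k nearest neighbors (S_x)
others_idx = np.setdiff1d(np.arange(len(X)), neighbor_idx)
# every non-neighbor is at least as far as the farthest neighbor
assert distances[others_idx].min() >= distances[neighbor_idx].max()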
[8]:
from scipy.spatial import minkowski_distance
[13]:
def k_nearest_neighbors(X, test_point, k):
    ## using minkowski distance with p=2 (equivalent to euclidean distance)
    distances = minkowski_distance(X, test_point, p=2)
    ## getting the indices of the k closest (lowest distance) points
    nearest_neighbors_indexes = np.argpartition(distances, k)[:k]
    return nearest_neighbors_indexes

def plot_neighbors(X, y, test_point, ax):
    sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, style=y, ax=ax, palette='deep', s=100)
    nearest_neighbors_indexes = k_nearest_neighbors(X, test_point, k=7)
    sns.scatterplot(x=test_point[[0]], y=test_point[[1]], color='r', ax=ax, marker='^',
                    s=150, label='test-point')
    sns.scatterplot(x=X[nearest_neighbors_indexes,0], y=X[nearest_neighbors_indexes,1], color='k', ax=ax,
                    s=100, alpha=0.5, label='nearest neighbors')
    ax.legend(bbox_to_anchor=(1, 1.3), loc='upper right')
Learn more about distance metrics here: https://machinelearningexploration.readthedocs.io/en/latest/MathExploration/distances.html
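As a small illustration of why the metric matters, the sketch below compares Minkowski distances for p=1 (Manhattan) and p=2 (Euclidean) on two made-up points; with a different p, a different set of points can end up being “nearest”.

# Minkowski distance for two illustrative points (not from the dataset)
a = np.array([0.0, 0.0])
b = np.array([3.0, 4.0])
minkowski_distance(a, b, p=1)   # Manhattan: |3| + |4| = 7
minkowski_distance(a, b, p=2)   # Euclidean: sqrt(3**2 + 4**2) = 5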
Training#
[14]:
fig, ax = plt.subplots(1, 3, figsize=(13, 7))
test_point1 = np.array([1.5, -1.5])
ax[0].set_title(f'test point 1 : {test_point1}')
plot_neighbors(X, y, test_point1, ax[0])
test_point2 = np.array([-2, -1])
ax[1].set_title(f'test point 2 : {test_point2}')
plot_neighbors(X, y, test_point2, ax[1])
test_point3 = np.array([0.5, -2])
ax[2].set_title(f'test point 3 : {test_point3}')
plot_neighbors(X, y, test_point3, ax[2])
plt.tight_layout()
plt.show()

In the above plot#
Test point 1 = (1.5, -1.5) lies firmly in class 1’s region, and all 7 nearest neighbors indicate label 1.
[15]:
nearest_neighbors_indexes = k_nearest_neighbors(X, test_point1, k = 7)
test_point1, y[nearest_neighbors_indexes]
[15]:
(array([ 1.5, -1.5]), array([1, 1, 1, 1, 1, 1, 1]))
Test point 2 = (-2.0, -1.0) lies firmly in class 0’s region, and all 7 nearest neighbors indicate label 0.
[16]:
nearest_neighbors_indexes = k_nearest_neighbors(X, test_point2, k = 7)
test_point2, y[nearest_neighbors_indexes]
[16]:
(array([-2, -1]), array([0, 0, 0, 0, 0, 0, 0]))
Test point 3 = (0.5, -2.0) lies roughly between the two clusters; under the Euclidean distance, 6 of its 7 nearest neighbors carry label 1.
[17]:
nearest_neighbors_indexes = k_nearest_neighbors(X, test_point3, k = 7)
test_point3, y[nearest_neighbors_indexes]
[17]:
(array([ 0.5, -2. ]), array([1, 1, 1, 0, 1, 1, 1]))
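Turning the neighbor labels above into a hard prediction is just a majority vote; a minimal sketch of that rule for test point 3, based on the 6-to-1 split shown in the previous output:

# Majority vote over the neighbor labels of test point 3
neighbor_labels = y[k_nearest_neighbors(X, test_point3, k=7)]   # [1, 1, 1, 0, 1, 1, 1] as above
votes = np.bincount(neighbor_labels)                            # votes per label: [1, 6]
predicted_label = votes.argmax()                                # label 1 wins, 6 votes to 1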
Prediction & Decision Boundary#
[23]:
def predict(X, y, test_point, k):
    nearest_neighbors_indexes = k_nearest_neighbors(X, test_point, k)
    pred_y = y[nearest_neighbors_indexes]
    labels, counts = np.unique(pred_y, return_counts=True)
    total_possibilities = counts.sum()
    pred_probs = {}
    # fraction of neighbors voting for each label
    for label, count in zip(labels, counts):
        pred_probs[label] = count / total_possibilities
    # labels with no votes among the neighbors get probability 0
    total_labels = np.unique(y)
    for label in total_labels:
        if label not in pred_probs:
            pred_probs[label] = 0
    return pred_probs

def prediction_boundary(X, y, k, ax):
    # Creating a meshgrid to generate predictions and decision boundary.
    xx, yy = np.meshgrid(np.linspace(X.min(), X.max(), 100), np.linspace(X.min(), X.max(), 100))
    mesh = np.c_[xx.ravel(), yy.ravel()]
    # predicted probability of label 1 at every grid point
    mesh_pred = np.array([predict(X, y, i, k=k).get(1) for i in mesh])
    zz = mesh_pred.reshape(xx.shape)
    sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, style=y, ax=ax, palette='deep')
    ax.contourf(xx, yy, zz, cmap=plt.cm.coolwarm, alpha=0.4)
    ax.set_title(f"k = {k}")
[27]:
fig, ax = plt.subplots(1, 4, figsize=(18, 5))
prediction_boundary(X, y, k=3, ax=ax[0])
prediction_boundary(X, y, k=5, ax=ax[1])
prediction_boundary(X, y, k=7, ax=ax[2])
prediction_boundary(X, y, k=10, ax=ax[3])
plt.legend()
plt.show()

As the number of nearest neighbors k increases, the transition region of the decision boundary widens, so more points near the boundary receive intermediate prediction probabilities rather than probabilities close to 1.
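The same widening effect can be cross-checked against scikit-learn’s KNeighborsClassifier, which uses the Minkowski metric with p=2 by default; this is a minimal sketch assuming its probability estimates mirror the hand-rolled predict above.

from sklearn.neighbors import KNeighborsClassifier

fig, ax = plt.subplots(1, 4, figsize=(18, 5))
for i, k in enumerate([3, 5, 7, 10]):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    xx, yy = np.meshgrid(np.linspace(X.min(), X.max(), 100), np.linspace(X.min(), X.max(), 100))
    # probability of label 1 over the same meshgrid as above
    zz = knn.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
    ax[i].contourf(xx, yy, zz, cmap=plt.cm.coolwarm, alpha=0.4)
    sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, style=y, ax=ax[i], palette='deep')
    ax[i].set_title(f"sklearn, k = {k}")
plt.tight_layout()
plt.show()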