This website works better with desktop in both themes, for mobile devices please change to light theme.

# K-Nearest Neighbors#

## Dataset Setup#

[5]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_classification

plt.style.use('seaborn')

%matplotlib inline

[6]:

n_samples = 100
X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_classes=2, class_sep=2,
n_clusters_per_class=1, random_state=888)

X.shape, y.shape

[6]:

((100, 2), (100,))

[7]:

fig = plt.figure(figsize=(5,5))
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, style=y, ax=ax1, palette='deep', s=100)
plt.show()


## Assumption#

• similar points have similar labels

## Algorithm#

\begin{align*} & \text{Test Point } x\\ & \text{k nearest neighbors }S_x \subseteq D \text{, such that } | S_x | = k\\ \\ & \forall (x',y') \in D \backslash S_x \implies \text{ part of dataset but not in nearest neighbors}\\ & \text{distance } (x, x') \ge {max\atop{(x'', y'') \in S_x}} dist(x, x'') \end{align*}

• In essence points that are not in k nearest neighbors than they must be farther than the test data points compared to the maximum distance from k nearest neighbors

• labels of k nearest neighbors idenitfies test point’s labels.

• knn is as good as the distance metric it uses.

[8]:

from scipy.spatial import minkowski_distance

[13]:

def k_nearest_neighbors(X, test_point, k):

## using minkowski distance with p2 (basically euclidean)
distances = minkowski_distance(X, test_point, p=2)

## getting closest(lowest distance) point's indeces
nearest_neighbors_indexes = np.argpartition(distances, k)[:k]
return nearest_neighbors_indexes

def plot_neighbors(X, y, test_point, ax):
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, style=y, ax=ax, palette='deep', s=100)

nearest_neighbors_indexes = k_nearest_neighbors(X, test_point, k = 7)

sns.scatterplot(x=test_point[[0]], y=test_point[[1]], color='r', ax=ax, palette='deep', marker='^',
s=150, label='test-point')
sns.scatterplot(x=X[nearest_neighbors_indexes,0], y=X[nearest_neighbors_indexes,1], color='k', ax=ax,
palette='deep', s=100, alpha=0.5, label='nearest neighbors')

ax.legend(bbox_to_anchor=(1, 1.3), loc='upper right')


## Training#

[14]:

fig, ax = plt.subplots(1, 3, figsize=(13, 7))

test_point1 = np.array([1.5, -1.5])
ax[0].set_title(f'test point 1 : {test_point1}')

plot_neighbors(X, y, test_point1, ax[0])

test_point2 = np.array([-2, -1])
ax[1].set_title(f'test point 2 : {test_point2}')

plot_neighbors(X, y, test_point2, ax[1])

test_point3 = np.array([0.5, -2])
ax[2].set_title(f'test point 3 : {test_point3}')
plot_neighbors(X, y, test_point3, ax[2])

plt.tight_layout()
plt.show()


### In above plot#

test point 1 = (1.5, -1.5) resides in 1’s area dominently and all 7 nearest neighbors are indicating 1 as label.

[15]:

nearest_neighbors_indexes = k_nearest_neighbors(X, test_point1, k = 7)
test_point1, y[nearest_neighbors_indexes]

[15]:

(array([ 1.5, -1.5]), array([1, 1, 1, 1, 1, 1, 1]))


test point 2 = (-2.0,-1,0) resides in 0’s area dominently and all k nearest neighbors are indicating 0 as label.

[16]:

nearest_neighbors_indexes = k_nearest_neighbors(X, test_point2, k = 7)
test_point2, y[nearest_neighbors_indexes]

[16]:

(array([-2, -1]), array([0, 0, 0, 0, 0, 0, 0]))


test point 3 = (0.5, -2) resides in somewhat center of the plot and due to euclidean distance calculation algorithm 6 out of 7 nearest neighbors have label as 1.

[17]:

nearest_neighbors_indexes = k_nearest_neighbors(X, test_point3, k = 7)
test_point3, y[nearest_neighbors_indexes]

[17]:

(array([ 0.5, -2. ]), array([1, 1, 1, 0, 1, 1, 1]))


## Prediction & Decision Boundary#

[23]:

def predict(X, y, test_point, k):

nearest_neighbors_indexes = k_nearest_neighbors(X, test_point, k)
pred_y = y[nearest_neighbors_indexes]

labels, counts =  np.unique(pred_y, return_counts=True)
total_possibilities = counts.sum()

pred_probs = {}

for i in zip(labels,counts):
pred_probs[i[0]] = i[1]/ total_possibilities

total_labels = np.unique(y)

for label in total_labels:
if label not in pred_probs:
pred_probs[label] = 0
return pred_probs

def prediction_boundary(X, y, k, ax):
# Creating a meshgrid to generate predictions and decision boundary.
xx, yy = np.meshgrid(np.linspace(X.min(), X.max(), 100),np.linspace(X.min() , X.max(), 100))
mesh = np.c_[xx.ravel(), yy.ravel()]

mesh_pred = np.array([predict(X, y, i, k=k).get(1) for i in mesh])
zz = mesh_pred.reshape(xx.shape)

sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, style=y, ax=ax, palette='deep')
ax.contourf(xx, yy, zz, cmap=plt.cm.coolwarm, alpha=0.4)
ax.set_title(f"k = {k}")

[27]:

fig, ax = plt.subplots(1, 4, figsize=(18, 5))

prediction_boundary(X, y, k=3, ax=ax[0])

prediction_boundary(X, y, k=5, ax=ax[1])

prediction_boundary(X, y, k=7, ax=ax[2])

prediction_boundary(X, y, k=10, ax=ax[3])

plt.legend()
plt.show()


With increasing number of nearest neighbors for prediction (k) the decision boundary widens more, which means more boundary cases will have less prediction probabilties closer to 1.