Introduction

Set Up the Environment

# Create a virtual environment in ./venv_utils.
python -m venv ./venv_utils
# Activate the environment in the current shell session.
source venv_utils/bin/activate
# Install the third-party packages imported by the code below.
pip install numpy pandas plotnine scikit-learn

Coding

import numpy as np
import pandas as pd
import plotnine as gg

from sklearn.datasets import make_blobs

# Simulate 100 two-dimensional observations scattered around three blob centres;
# random_state pins the draw so the example is reproducible.
X, y = make_blobs(n_samples=100, centers=3, n_features=2, random_state=1)
# Standardise column-wise: subtract each column's mean, divide by its standard deviation.
X = np.divide(X - X.mean(axis=0), X.std(axis=0))

# Wrap the coordinates in a DataFrame and give every point the placeholder label "0".
X = pd.DataFrame(X, columns=["x1", "x2"]).assign(cluster="0")

def init_centers(X, k):
    """Pick k distinct rows of X to serve as the initial cluster centres.

    Parameters
    ----------
    X : pandas.DataFrame
        Data set to sample from; one row per observation.
    k : int
        Number of centres to draw. Must not exceed the number of rows of X.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the k selected rows (original index labels are
        kept), so the caller can modify it without touching X.

    Raises
    ------
    ValueError
        If k is larger than the number of rows of X.
    """
    n_rows = X.shape[0]
    if k > n_rows:
        raise ValueError(
            f"cannot draw {k} distinct centers from only {n_rows} observations"
        )
    # Sample WITHOUT replacement: drawing the same row twice would create two
    # identical centres, one of which could never win any observation.
    # The global NumPy generator is used so np.random.seed() still controls the draw.
    indxs = np.random.choice(n_rows, size=k, replace=False)
    # Explicit copy so later label assignment on the result is not a chained
    # assignment on a selection of X.
    return X.iloc[indxs, :].copy()

# Seed NumPy's global generator so the initial centres are reproducible.
np.random.seed(1)
# Work on an explicit copy: writing labels into a selection taken from X is a
# chained assignment (SettingWithCopyWarning on pandas without copy-on-write).
centers = init_centers(X, 3).copy()
# Label the three initial centres "1", "2", "3" in row order.
centers.loc[:, "cluster"] = ["1", "2", "3"]

def find_cluster(X, centers):
    """Assign every observation in X to its nearest centre (Euclidean distance).

    Parameters
    ----------
    X : pandas.DataFrame
        Observations; the first two columns are used as coordinates.
    centers : pandas.DataFrame
        One row per centre; the first two columns are used as coordinates.
        If a "cluster" column is present its values are used as the labels,
        otherwise the centres are labelled "1", "2", ... in row order.

    Returns
    -------
    list
        Label of the closest centre for each row of X, in row order.
    """
    A = X.iloc[:, 0:2].to_numpy()
    B = centers.iloc[:, 0:2].to_numpy()
    # Take the labels from the centres themselves so that any number of centres
    # works and a centre keeps its own label even when another cluster has
    # disappeared (e.g. rows labelled "1" and "3" only).
    if "cluster" in centers.columns:
        labels = centers["cluster"].tolist()
    else:
        labels = [str(i + 1) for i in range(B.shape[0])]
    # B[:, np.newaxis] has shape (k, 1, 2); broadcasting against A (n, 2) and
    # summing over the coordinate axis gives a (k, n) matrix of distances.
    distances = np.sqrt(((A - B[:, np.newaxis]) ** 2).sum(axis=2))
    # argmin over axis 0 picks, for each observation, the row of the nearest centre.
    return [labels[i] for i in distances.argmin(axis=0)]

# Give every observation the label of its nearest initial centre.
X["cluster"] = find_cluster(X, centers)

def update_centers(X):
    """Recompute each centre as the mean of the observations assigned to it.

    Parameters
    ----------
    X : pandas.DataFrame
        Observations with a "cluster" column holding the current labels; all
        remaining columns are averaged per label.

    Returns
    -------
    pandas.DataFrame
        One row per label present in X: the averaged columns followed by a
        "cluster" column with the label, on a fresh 0..n-1 index.
    """
    means = X.groupby("cluster").mean()
    # assign() appends the label column after the averaged columns; the group
    # index is then dropped in favour of a plain positional index.
    return means.assign(cluster=means.index).reset_index(drop=True)

# Move each centre to the mean of the observations currently assigned to it.
centers = update_centers(X)

# Alternate assignment and centre update until the labels stop changing.
while True:
    # Snapshot the labels with an explicit copy: a bare reference to the column
    # can share memory with X, so the in-place assignment below would change
    # it too and the comparison would always report convergence.
    old_clusters = X["cluster"].copy()
    X.loc[:, "cluster"] = find_cluster(X, centers)
    centers = update_centers(X)
    if np.array_equal(X["cluster"], old_clusters):
        break

# Start from the observations, coloured by their assigned cluster.
p = gg.ggplot(data=X, mapping=gg.aes(x="x1", y="x2", color="cluster"))
p += gg.geom_point()
# Overlay the centres as larger points using the same colour mapping.
p += gg.geom_point(data=centers, mapping=gg.aes(x="x1", y="x2", color="cluster"), size=3)
p += gg.coord_fixed()
p += gg.theme_minimal()
p += gg.scale_color_manual(values=["#c20305", "#377eb8", "#4daf4a"])
p