Introduction
Set Up the Environment
python -m venv ./venv_utils
source venv_utils/bin/activate
pip install numpy pandas plotnine scikit-learn
Coding
import numpy as np
import pandas as pd
import plotnine as gg
from sklearn.datasets import make_blobs
# Draw 100 two-dimensional points around three blob centers; the fixed
# random_state makes the data set reproducible.
X, y = make_blobs(n_samples=100, n_features=2, centers=3, random_state=1)
# Standardise each feature (subtract the column mean, divide by the column
# standard deviation) and wrap the result in a labelled DataFrame.
X = pd.DataFrame(
    (X - X.mean(axis=0)) / X.std(axis=0),
    columns=["x1", "x2"],
)
# Placeholder label "0": no point has been assigned to a cluster yet.
X["cluster"] = ["0"] * len(X)
def init_centers(X, k):
    """Pick k distinct rows of X to serve as the initial cluster centers.

    Parameters
    ----------
    X : pandas.DataFrame
        Data set to sample from, one row per observation.
    k : int
        Number of centers to draw.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the k selected rows; the original index
        labels of those rows are kept.

    Raises
    ------
    ValueError
        If k is larger than the number of rows in X.
    """
    n_rows = X.shape[0]
    if k > n_rows:
        raise ValueError(f"cannot draw {k} distinct centers from {n_rows} rows")
    # Sample WITHOUT replacement: drawing the same row twice would create two
    # identical centers, one of which can never win a point and so yields an
    # empty cluster.
    indxs = np.random.choice(n_rows, size=k, replace=False)
    # Return a copy so that callers can relabel the centers without writing
    # into (or getting chained-assignment warnings about) the source frame.
    return X.iloc[indxs, :].copy()
# Seed NumPy's global generator so the initial centers are reproducible.
np.random.seed(1)
# Take an explicit copy: the selected rows originate from X, and labelling
# them must not touch X or trigger a SettingWithCopyWarning.
centers = init_centers(X, 3).copy()
# Name the three starting centers "1", "2" and "3".
centers["cluster"] = ["1", "2", "3"]
def find_cluster(X, centers):
    """Assign every row of X to its nearest center (Euclidean distance).

    The first two columns of both frames are used as coordinates.

    Parameters
    ----------
    X : pandas.DataFrame
        Observations; the first two columns hold the coordinates.
    centers : pandas.DataFrame
        One row per center; the first two columns hold the coordinates.
        If a "cluster" column is present its values are used as the labels,
        otherwise centers are labelled "1", "2", ... by row position.

    Returns
    -------
    list
        One label per row of X, in row order.
    """
    A = X.iloc[:, 0:2].to_numpy()
    B = centers.iloc[:, 0:2].to_numpy()
    # Take the labels from the centers themselves rather than assuming exactly
    # three centers named "1".."3": when a cluster empties out, the remaining
    # centers keep their own labels instead of being renumbered by position.
    if "cluster" in centers.columns:
        labels = centers["cluster"].tolist()
    else:
        labels = [str(i + 1) for i in range(B.shape[0])]
    # Broadcasting (k, 1, 2) against (n, 2) gives a (k, n) distance matrix:
    # row = center, column = observation.
    distances = np.sqrt(((A - B[:, np.newaxis]) ** 2).sum(axis=2))
    nearest = distances.argmin(axis=0)
    return [labels[i] for i in nearest]
X.loc[:, "cluster"] = find_cluster(X, centers)
def update_centers(X):
    """Compute the mean position of every cluster currently present in X.

    Parameters
    ----------
    X : pandas.DataFrame
        Observations with a "cluster" column; all remaining columns are
        averaged per cluster.

    Returns
    -------
    pandas.DataFrame
        One row per cluster on a fresh 0..k-1 index: the averaged columns
        first, followed by a trailing "cluster" column holding the label.
    """
    means = X.groupby("cluster").mean()
    # Append the label as the LAST column (rather than reset_index(), which
    # would put it first) so the coordinates stay in the leading columns.
    return means.assign(cluster=means.index).reset_index(drop=True)
# Move each center to the mean of the points initially assigned to it, then
# alternate "reassign points" / "recompute centers" until nothing changes.
centers = update_centers(X)
while True:
    # Snapshot the labels with an explicit copy. A bare `X.cluster` can share
    # memory with the column that is overwritten on the next line, in which
    # case the comparison below is always true and the loop stops after a
    # single pass regardless of convergence.
    old_clusters = X["cluster"].copy()
    X.loc[:, "cluster"] = find_cluster(X, centers)
    centers = update_centers(X)
    # Converged: no point changed cluster during this pass.
    if np.array_equal(X["cluster"], old_clusters):
        break
# Scatter plot of the observations, coloured by their assigned cluster.
p = gg.ggplot(data=X, mapping=gg.aes(x="x1", y="x2", color="cluster"))
p = p + gg.geom_point()
# Overlay the cluster centers as a second point layer drawn with size=3.
p = p + gg.geom_point(
    data=centers,
    mapping=gg.aes(x="x1", y="x2", color="cluster"),
    size=3,
)
# Fixed aspect ratio so distances look the same along both axes.
p = p + gg.coord_fixed()
p = p + gg.theme_minimal()
# One manual colour per cluster label.
p = p + gg.scale_color_manual(values=["#c20305", "#377eb8", "#4daf4a"])
# Bare expression: displays the figure when run in a notebook/REPL.
p