k-means clustering from scratch
k-means clustering - from scratch
A NumPy implementation of k-means clustering
Useful links
Code
def _assign_to_nearest(obs, centroids):
    """Return (labels, distance to the assigned centroid) for every observation."""
    # distance_matrix[i][j] is the Euclidean distance of obs i from centroid j
    distance_matrix = np.empty((obs.shape[0], centroids.shape[0]))
    for c, centroid in enumerate(centroids):
        distance_matrix[:, c] = np.linalg.norm(obs - centroid, axis=1)
    # each obs is assigned to the cluster represented by the nearest centroid
    clusters = np.argmin(distance_matrix, axis=1)
    return clusters, distance_matrix[np.arange(obs.shape[0]), clusters]


def kmeansclustering(obs: np.ndarray, n_clusters: int, maxiter: int = 100, maxflat: int = 5,
                     tol: float = 0.01) -> np.ndarray:
    """Cluster the rows of ``obs`` into ``n_clusters`` groups with k-means.

    Centroids are initialised as ``n_clusters`` distinct rows of ``obs`` drawn
    at random. Each iteration assigns every observation to its nearest
    centroid (Euclidean distance) and then moves each centroid to the mean of
    the observations assigned to it.

    Parameters
    ----------
    obs
        2-D array with one observation per row and one feature per column.
    n_clusters
        Number of clusters to form.
    maxiter
        Upper bound on the number of iterations.
    maxflat
        Number of "flat" iterations after which the loop stops. An iteration
        is flat when the average point-to-centroid distance changed by less
        than ``tol`` with respect to the previous iteration. Flat iterations
        are counted cumulatively; they do not need to be consecutive.
    tol
        Tolerance on the change of the average distance, see ``maxflat``.

    Returns
    -------
    np.ndarray
        1-D integer array; element ``i`` is the cluster index (in
        ``range(n_clusters)``) assigned to ``obs[i]`` in the last assignment
        step that was executed.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n_clusters`` is larger than the
        number of observations.

    Notes
    -----
    The function reseeds NumPy's *global* random generator with a fixed seed
    on every call, so results are reproducible but the caller's global random
    state is overwritten.
    """
    # NOTE: fixed seed on the global generator, kept for reproducibility and
    # backward compatibility with callers that rely on it.
    np.random.seed(6)
    obs = np.asarray(obs)

    # initialise the centroids as random points selected from the dataset;
    # cast to float so that cluster means are not truncated for integer input
    initial_centroids_i = np.random.choice(obs.shape[0], n_clusters, replace=False)
    centroids = obs[initial_centroids_i, :].astype(float)

    # keep track of the average distance between points and their cluster
    # centroid: the loop stops after `maxflat` iterations whose variation in
    # average distance is < `tol`, or after `maxiter` iterations
    prev_distance = np.inf
    flat = 0
    count = 0
    clusters = None
    while flat < maxflat and count < maxiter:
        clusters, nearest_distance = _assign_to_nearest(obs, centroids)

        # if variation < tol this iteration counts as a "flat" iteration
        avg_distance = np.mean(nearest_distance)
        if abs(prev_distance - avg_distance) < tol:
            flat += 1
        prev_distance = avg_distance

        # compute the new centroids as cluster means
        for c in range(n_clusters):
            cluster_obs = obs[clusters == c]
            # an empty cluster keeps its previous centroid: the mean of zero
            # rows is NaN, which would poison every later distance and argmin
            if cluster_obs.shape[0] > 0:
                centroids[c] = np.mean(cluster_obs, axis=0)

        count += 1

    if clusters is None:
        # the loop never ran (maxiter <= 0 or maxflat <= 0): fall back to the
        # assignment induced by the initial centroids
        clusters, _ = _assign_to_nearest(obs, centroids)
    return clusters