kmeans clustering fromscratch

kmeans clustering - fromscratch

A NumPy implementation of k-means clustering.

Useful links

Code

def kmeansclustering(obs:np.ndarray, n_clusters:int, maxiter:int=100, maxflat:int=5, tol:float=0.01) -> np.ndarray:
    """Cluster the rows of ``obs`` into ``n_clusters`` groups with k-means.

    Centroids are initialised as ``n_clusters`` distinct rows of ``obs`` and then
    refined by alternating an assignment step (each observation goes to its nearest
    centroid, Euclidean distance) and an update step (each centroid becomes the mean
    of the observations assigned to it).

    Parameters
    ----------
    obs
        2-D array with one observation per row and one feature per column.
        It is not modified.
    n_clusters
        Number of clusters to build.
    maxiter
        Maximum number of assignment steps. At least one assignment step is
        always performed.
    maxflat
        Number of "flat" iterations (see ``tol``) after which the loop stops.
    tol
        An iteration is "flat" when the average point-to-centroid distance
        changed by less than ``tol`` with respect to the previous iteration.

    Returns
    -------
    np.ndarray
        1-D integer array; element ``i`` is the index of the cluster assigned
        to ``obs[i]``.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``n_clusters`` exceeds the
        number of observations (sampling is done without replacement).
    """
    # NOTE: this reseeds NumPy's *global* RNG. It makes the result deterministic, but
    # it also resets the caller's random stream; kept for backward compatibility.
    np.random.seed(6)

    obs = np.asarray(obs)
    n_obs = obs.shape[0]

    # initialise the centroids as random points selected from the dataset;
    # cast to float so the mean updates are not truncated when obs has an integer dtype
    initial_centroids_i = np.random.choice(n_obs, n_clusters, replace=False)
    centroids = obs[initial_centroids_i, :].astype(float)

    # distance_matrix[i][j] will be the distance of obs i from the centroid of cluster j
    distance_matrix = np.zeros((n_obs, n_clusters))

    # keep track of the average distance between points and their cluster centroid:
    # the loop stops after "maxflat" iterations whose variation in avg. distance is < "tol",
    # or after "maxiter" iterations, whichever comes first
    prev_distance = np.inf
    flat = 0
    count = 0

    while True:
        for c in range(n_clusters):
            # save distance from obs to each of the centroids into distance matrix
            distance_matrix[:, c] = np.linalg.norm(obs - centroids[c], axis=1)

        # the obs is assigned to the cluster represented by the nearest centroid
        clusters = np.argmin(distance_matrix, axis=1)

        # average distance between points and their cluster centroid
        avg_distance = np.mean(distance_matrix[np.arange(n_obs), clusters])

        # if variation < tol this iteration counts as a "flat" iteration
        if abs(prev_distance - avg_distance) < tol:
            flat += 1

        prev_distance = avg_distance
        count += 1

        # stop right after the assignment step: updating the centroids once more
        # would not change the labels that are returned
        if flat >= maxflat or count >= maxiter:
            break

        for c in range(n_clusters):
            # compute the new centroids as clusters means
            cluster_obs = obs[clusters == c]
            # an empty cluster keeps its previous centroid: the mean of zero rows is NaN,
            # and a NaN centroid would capture every point at the next argmin
            if cluster_obs.shape[0] > 0:
                centroids[c] = cluster_obs.mean(axis=0)

    return clusters

Github repository

https://github.com/AndreaZoccatelli/kmeans_fromscratch

Previous: knn fromscratch | Next: From law of cosines to cosine similarity

Code

def kmeansclustering(obs:np.ndarray, n_clusters:int, maxiter:int=100, maxflat:int=5, tol:float=0.01) -> np.ndarray:
    """Cluster the rows of ``obs`` into ``n_clusters`` groups with k-means.

    Centroids are initialised as ``n_clusters`` distinct rows of ``obs`` and then
    refined by alternating an assignment step (each observation goes to its nearest
    centroid, Euclidean distance) and an update step (each centroid becomes the mean
    of the observations assigned to it).

    Parameters
    ----------
    obs
        2-D array with one observation per row and one feature per column.
        It is not modified.
    n_clusters
        Number of clusters to build.
    maxiter
        Maximum number of assignment steps. At least one assignment step is
        always performed.
    maxflat
        Number of "flat" iterations (see ``tol``) after which the loop stops.
    tol
        An iteration is "flat" when the average point-to-centroid distance
        changed by less than ``tol`` with respect to the previous iteration.

    Returns
    -------
    np.ndarray
        1-D integer array; element ``i`` is the index of the cluster assigned
        to ``obs[i]``.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``n_clusters`` exceeds the
        number of observations (sampling is done without replacement).
    """
    # NOTE: this reseeds NumPy's *global* RNG. It makes the result deterministic, but
    # it also resets the caller's random stream; kept for backward compatibility.
    np.random.seed(6)

    obs = np.asarray(obs)
    n_obs = obs.shape[0]

    # initialise the centroids as random points selected from the dataset;
    # cast to float so the mean updates are not truncated when obs has an integer dtype
    initial_centroids_i = np.random.choice(n_obs, n_clusters, replace=False)
    centroids = obs[initial_centroids_i, :].astype(float)

    # distance_matrix[i][j] will be the distance of obs i from the centroid of cluster j
    distance_matrix = np.zeros((n_obs, n_clusters))

    # keep track of the average distance between points and their cluster centroid:
    # the loop stops after "maxflat" iterations whose variation in avg. distance is < "tol",
    # or after "maxiter" iterations, whichever comes first
    prev_distance = np.inf
    flat = 0
    count = 0

    while True:
        for c in range(n_clusters):
            # save distance from obs to each of the centroids into distance matrix
            distance_matrix[:, c] = np.linalg.norm(obs - centroids[c], axis=1)

        # the obs is assigned to the cluster represented by the nearest centroid
        clusters = np.argmin(distance_matrix, axis=1)

        # average distance between points and their cluster centroid
        avg_distance = np.mean(distance_matrix[np.arange(n_obs), clusters])

        # if variation < tol this iteration counts as a "flat" iteration
        if abs(prev_distance - avg_distance) < tol:
            flat += 1

        prev_distance = avg_distance
        count += 1

        # stop right after the assignment step: updating the centroids once more
        # would not change the labels that are returned
        if flat >= maxflat or count >= maxiter:
            break

        for c in range(n_clusters):
            # compute the new centroids as clusters means
            cluster_obs = obs[clusters == c]
            # an empty cluster keeps its previous centroid: the mean of zero rows is NaN,
            # and a NaN centroid would capture every point at the next argmin
            if cluster_obs.shape[0] > 0:
                centroids[c] = cluster_obs.mean(axis=0)

    return clusters