diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py
index 8d3f6f0dfbcb..d261c3bfb505 100644
--- a/dynamic_programming/k_means_clustering_tensorflow.py
+++ b/dynamic_programming/k_means_clustering_tensorflow.py
@@ -1,146 +1,63 @@
-from random import shuffle
-
 import tensorflow as tf
-from numpy import array
 
 
-def tf_k_means_cluster(vectors, noofclusters):
+def tf_k_means_clustering(vectors, noofclusters, max_iterations=100, tolerance=1e-4):
     """
-    K-Means Clustering using TensorFlow.
-    'vectors' should be a n*k 2-D NumPy array, where n is the number
-    of vectors of dimensionality k.
-    'noofclusters' should be an integer.
+    Performs K-means clustering with an efficient, vectorized TensorFlow 2.x
+    approach. 'vectors' should be an n*d 2-D NumPy array, where n is the number
+    of vectors of dimensionality d. 'noofclusters' should be an integer.
+    Returns the final centroids and the cluster index assigned to each vector.
     """
 
+    vectors = tf.constant(vectors, dtype=tf.float32)
     noofclusters = int(noofclusters)
-    assert noofclusters < len(vectors)
-
-    # Find out the dimensionality
-    dim = len(vectors[0])
-
-    # Will help select random centroids from among the available vectors
-    vector_indices = list(range(len(vectors)))
-    shuffle(vector_indices)
-
-    # GRAPH OF COMPUTATION
-    # We initialize a new graph and set it as the default during each run
-    # of this algorithm. This ensures that as this function is called
-    # multiple times, the default graph doesn't keep getting crowded with
-    # unused ops and Variables from previous function calls.
-
-    graph = tf.Graph()
-
-    with graph.as_default():
-        # SESSION OF COMPUTATION
-
-        sess = tf.Session()
-
-        ##CONSTRUCTING THE ELEMENTS OF COMPUTATION
-
-        ##First lets ensure we have a Variable vector for each centroid,
-        ##initialized to one of the vectors from the available data points
-        centroids = [
-            tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters)
-        ]
-        ##These nodes will assign the centroid Variables the appropriate
-        ##values
-        centroid_value = tf.placeholder("float64", [dim])
-        cent_assigns = []
-        for centroid in centroids:
-            cent_assigns.append(tf.assign(centroid, centroid_value))
-
-        ##Variables for cluster assignments of individual vectors(initialized
-        ##to 0 at first)
-        assignments = [tf.Variable(0) for i in range(len(vectors))]
-        ##These nodes will assign an assignment Variable the appropriate
-        ##value
-        assignment_value = tf.placeholder("int32")
-        cluster_assigns = []
-        for assignment in assignments:
-            cluster_assigns.append(tf.assign(assignment, assignment_value))
-
-        ##Now lets construct the node that will compute the mean
-        # The placeholder for the input
-        mean_input = tf.placeholder("float", [None, dim])
-        # The Node/op takes the input and computes a mean along the 0th
-        # dimension, i.e. the list of input vectors
-        mean_op = tf.reduce_mean(mean_input, 0)
-
-        ##Node for computing Euclidean distances
-        # Placeholders for input
-        v1 = tf.placeholder("float", [dim])
-        v2 = tf.placeholder("float", [dim])
-        euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(v1, v2), 2)))
-
-        ##This node will figure out which cluster to assign a vector to,
-        ##based on Euclidean distances of the vector from the centroids.
-        # Placeholder for input
-        centroid_distances = tf.placeholder("float", [noofclusters])
-        cluster_assignment = tf.argmin(centroid_distances, 0)
-
-        ##INITIALIZING STATE VARIABLES
-
-        ##This will help initialization of all Variables defined with respect
-        ##to the graph. The Variable-initializer should be defined after
-        ##all the Variables have been constructed, so that each of them
-        ##will be included in the initialization.
-        init_op = tf.initialize_all_variables()
-
-        # Initialize all variables
-        sess.run(init_op)
-
-        ##CLUSTERING ITERATIONS
-
-        # Now perform the Expectation-Maximization steps of K-Means clustering
-        # iterations. To keep things simple, we will only do a set number of
-        # iterations, instead of using a Stopping Criterion.
-        noofiterations = 100
-        for _ in range(noofiterations):
-            ##EXPECTATION STEP
-            ##Based on the centroid locations till last iteration, compute
-            ##the _expected_ centroid assignments.
-            # Iterate over each vector
-            for vector_n in range(len(vectors)):
-                vect = vectors[vector_n]
-                # Compute Euclidean distance between this vector and each
-                # centroid. Remember that this list cannot be named
-                #'centroid_distances', since that is the input to the
-                # cluster assignment node.
-                distances = [
-                    sess.run(euclid_dist, feed_dict={v1: vect, v2: sess.run(centroid)})
-                    for centroid in centroids
-                ]
-                # Now use the cluster assignment node, with the distances
-                # as the input
-                assignment = sess.run(
-                    cluster_assignment, feed_dict={centroid_distances: distances}
-                )
-                # Now assign the value to the appropriate state variable
-                sess.run(
-                    cluster_assigns[vector_n], feed_dict={assignment_value: assignment}
-                )
-
-            ##MAXIMIZATION STEP
-            # Based on the expected state computed from the Expectation Step,
-            # compute the locations of the centroids so as to maximize the
-            # overall objective of minimizing within-cluster Sum-of-Squares
-            for cluster_n in range(noofclusters):
-                # Collect all the vectors assigned to this cluster
-                assigned_vects = [
-                    vectors[i]
-                    for i in range(len(vectors))
-                    if sess.run(assignments[i]) == cluster_n
-                ]
-                # Compute new centroid location
-                new_location = sess.run(
-                    mean_op, feed_dict={mean_input: array(assigned_vects)}
-                )
-                # Assign value to appropriate variable
-                sess.run(
-                    cent_assigns[cluster_n], feed_dict={centroid_value: new_location}
-                )
-
-        # Return centroids and assignments
-        centroids = sess.run(centroids)
-        assignments = sess.run(assignments)
-        return centroids, assignments
+    num_data_points = tf.shape(vectors)[0]
+
+    if noofclusters > num_data_points:
+        raise ValueError(
+            "Number of clusters (k) cannot be greater than the number of data points."
+        )
+
+    # Initialize centroids with k distinct data points, chosen by taking the
+    # first k indices of a randomly shuffled index range
+    initial_indices = tf.random.shuffle(tf.range(num_data_points))[:noofclusters]
+    centroids = tf.Variable(tf.gather(vectors, initial_indices))
+
+    @tf.function
+    def train_step():
+        # Find the closest centroid for each vector
+        distances_sq = tf.reduce_sum(
+            tf.square(tf.expand_dims(vectors, 1) - tf.expand_dims(centroids, 0)), 2
+        )
+        assignments = tf.argmin(distances_sq, axis=1)
+
+        # Recalculate centroids efficiently with per-cluster segment sums
+        sums = tf.math.unsorted_segment_sum(
+            vectors, assignments, num_segments=noofclusters
+        )
+        counts = tf.math.unsorted_segment_sum(
+            tf.ones_like(vectors), assignments, num_segments=noofclusters
+        )
+
+        # Avoid division by zero for empty clusters
+        new_centroids = sums / tf.maximum(counts, 1e-9)
+
+        # For empty clusters, keep the old centroid to prevent them from
+        # collapsing towards the origin
+        is_empty = tf.equal(tf.reduce_sum(counts, axis=1), 0)
+        new_centroids = tf.where(tf.expand_dims(is_empty, 1), centroids, new_centroids)
+
+        return assignments, new_centroids
+
+    # Main iteration loop: stop after max_iterations, or earlier once the
+    # centroids move less than the tolerance between iterations
+    for _ in range(max_iterations):
+        old_centroids = tf.identity(centroids)
+        assignments, new_centroids_val = train_step()
+        centroids.assign(new_centroids_val)
+
+        # Check for convergence
+        if tf.reduce_sum(tf.square(old_centroids - centroids)) < tolerance:
+            break
+
+    return centroids.numpy(), assignments.numpy()
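
Note on the centroid update in train_step: rather than looping over clusters, the new code recomputes all centroids at once with tf.math.unsorted_segment_sum. A minimal standalone sketch of that trick, with points and cluster assignments made up purely for illustration:

    import tensorflow as tf

    # Four 1-D points; the first two assigned to cluster 0, the rest to cluster 1
    pts = tf.constant([[1.0], [3.0], [10.0], [14.0]])
    ids = tf.constant([0, 0, 1, 1])

    # Per-cluster sums and member counts, each in one vectorized op
    sums = tf.math.unsorted_segment_sum(pts, ids, num_segments=2)
    counts = tf.math.unsorted_segment_sum(tf.ones_like(pts), ids, num_segments=2)

    print((sums / counts).numpy())  # [[2.], [12.]] -- the per-cluster means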
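
A minimal usage sketch for the new entry point (not part of the diff; the sample data and cluster count are illustrative, and TensorFlow 2.x plus NumPy are assumed to be installed):

    import numpy as np

    from dynamic_programming.k_means_clustering_tensorflow import tf_k_means_clustering

    # Two well-separated 2-D Gaussian blobs
    rng = np.random.default_rng(0)
    data = np.concatenate(
        [rng.normal(0.0, 1.0, (100, 2)), rng.normal(10.0, 1.0, (100, 2))]
    )

    centroids, assignments = tf_k_means_clustering(data, noofclusters=2)
    print(centroids.shape)    # (2, 2): one 2-D centroid per cluster
    print(assignments.shape)  # (200,): cluster index for each input vector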