From b78dfe3c237769ced9957b8a49713f8c5e4ec614 Mon Sep 17 00:00:00 2001 From: Raisa Jose <168289547+raisaaajose@users.noreply.github.com> Date: Tue, 29 Jul 2025 16:34:14 -0700 Subject: [PATCH 1/4] feat: Update for TensorFlow 2, add doctests Updated K-means clustering implementation for TensorFlow 2.x compatibility. Doctests have been added for code clarity and testing. Core algorithm logic and structure remain unchanged. --- .../k_means_clustering_tensorflow.py | 210 ++++++------------ 1 file changed, 72 insertions(+), 138 deletions(-) diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py index 8d3f6f0dfbcb..43a67503133c 100644 --- a/dynamic_programming/k_means_clustering_tensorflow.py +++ b/dynamic_programming/k_means_clustering_tensorflow.py @@ -1,146 +1,80 @@ from random import shuffle - import tensorflow as tf -from numpy import array +import numpy as np -def tf_k_means_cluster(vectors, noofclusters): +def tf_k_means_cluster_fixed(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4): """ - K-Means Clustering using TensorFlow. - 'vectors' should be a n*k 2-D NumPy array, where n is the number - of vectors of dimensionality k. - 'noofclusters' should be an integer. + Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x + + Parameters: + vectors (list): A list of vectors. + noofclusters (int): The number of clusters (k). + max_iterations(int): maximum number of iterations or how many times the algorithm will refine its cluster assignments and centroid positions, until convergence. + tolerance(int): defines a convergence criterion. The K-means algorithm stops when the centroids move less than this tolerance value between consecutive iterations. + + (set same random seed in all examples for reproducibility) + >>>tf.random.set_seed(42) + + Example 1: + >>>data2 = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32) + >>>centroids2, assignments2 = tf_k_means_cluster_fixed(data2, 2) + >>>print(centroids2,assignments2) + [[ 0.05 0.05] + [10. 10. ]] [0 0 1] + + Example 2 (Idential data points): + >>>data_identical = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32) + >>>centroids, assignments = tf_k_means_cluster_fixed(data_identical, 1) + >>>print(centroids,assignments) + + Example 3 (k>N): + >>>data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32) + >>>centroids, assignments = tf_k_means_cluster_fixed(data, 5) + >>>print(centroids,assignments) """ + + vectors = tf.constant(vectors, dtype=tf.float32) noofclusters = int(noofclusters) - assert noofclusters < len(vectors) - - # Find out the dimensionality - dim = len(vectors[0]) - - # Will help select random centroids from among the available vectors - vector_indices = list(range(len(vectors))) - shuffle(vector_indices) - - # GRAPH OF COMPUTATION - # We initialize a new graph and set it as the default during each run - # of this algorithm. This ensures that as this function is called - # multiple times, the default graph doesn't keep getting crowded with - # unused ops and Variables from previous function calls. - - graph = tf.Graph() - - with graph.as_default(): - # SESSION OF COMPUTATION - - sess = tf.Session() - - ##CONSTRUCTING THE ELEMENTS OF COMPUTATION - - ##First lets ensure we have a Variable vector for each centroid, - ##initialized to one of the vectors from the available data points - centroids = [ - tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters) - ] - ##These nodes will assign the centroid Variables the appropriate - ##values - centroid_value = tf.placeholder("float64", [dim]) - cent_assigns = [] - for centroid in centroids: - cent_assigns.append(tf.assign(centroid, centroid_value)) - - ##Variables for cluster assignments of individual vectors(initialized - ##to 0 at first) - assignments = [tf.Variable(0) for i in range(len(vectors))] - ##These nodes will assign an assignment Variable the appropriate - ##value - assignment_value = tf.placeholder("int32") - cluster_assigns = [] - for assignment in assignments: - cluster_assigns.append(tf.assign(assignment, assignment_value)) - - ##Now lets construct the node that will compute the mean - # The placeholder for the input - mean_input = tf.placeholder("float", [None, dim]) - # The Node/op takes the input and computes a mean along the 0th - # dimension, i.e. the list of input vectors - mean_op = tf.reduce_mean(mean_input, 0) - - ##Node for computing Euclidean distances - # Placeholders for input - v1 = tf.placeholder("float", [dim]) - v2 = tf.placeholder("float", [dim]) - euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(v1, v2), 2))) - - ##This node will figure out which cluster to assign a vector to, - ##based on Euclidean distances of the vector from the centroids. - # Placeholder for input - centroid_distances = tf.placeholder("float", [noofclusters]) - cluster_assignment = tf.argmin(centroid_distances, 0) - - ##INITIALIZING STATE VARIABLES - - ##This will help initialization of all Variables defined with respect - ##to the graph. The Variable-initializer should be defined after - ##all the Variables have been constructed, so that each of them - ##will be included in the initialization. - init_op = tf.initialize_all_variables() - - # Initialize all variables - sess.run(init_op) - - ##CLUSTERING ITERATIONS - - # Now perform the Expectation-Maximization steps of K-Means clustering - # iterations. To keep things simple, we will only do a set number of - # iterations, instead of using a Stopping Criterion. - noofiterations = 100 - for _ in range(noofiterations): - ##EXPECTATION STEP - ##Based on the centroid locations till last iteration, compute - ##the _expected_ centroid assignments. - # Iterate over each vector - for vector_n in range(len(vectors)): - vect = vectors[vector_n] - # Compute Euclidean distance between this vector and each - # centroid. Remember that this list cannot be named - #'centroid_distances', since that is the input to the - # cluster assignment node. - distances = [ - sess.run(euclid_dist, feed_dict={v1: vect, v2: sess.run(centroid)}) - for centroid in centroids - ] - # Now use the cluster assignment node, with the distances - # as the input - assignment = sess.run( - cluster_assignment, feed_dict={centroid_distances: distances} - ) - # Now assign the value to the appropriate state variable - sess.run( - cluster_assigns[vector_n], feed_dict={assignment_value: assignment} - ) - - ##MAXIMIZATION STEP - # Based on the expected state computed from the Expectation Step, - # compute the locations of the centroids so as to maximize the - # overall objective of minimizing within-cluster Sum-of-Squares - for cluster_n in range(noofclusters): - # Collect all the vectors assigned to this cluster - assigned_vects = [ - vectors[i] - for i in range(len(vectors)) - if sess.run(assignments[i]) == cluster_n - ] - # Compute new centroid location - new_location = sess.run( - mean_op, feed_dict={mean_input: array(assigned_vects)} - ) - # Assign value to appropriate variable - sess.run( - cent_assigns[cluster_n], feed_dict={centroid_value: new_location} - ) - - # Return centroids and assignments - centroids = sess.run(centroids) - assignments = sess.run(assignments) - return centroids, assignments + num_data_points = tf.shape(vectors)[0] + + if noofclusters > num_data_points: + raise ValueError("Number of clusters (k) cannot be greater than the number of data points.") + + # Initialize centroids randomly from first k(no: of clusters) elements from the shuffled data points + initial_indices = tf.random.shuffle(tf.range(tf.shape(vectors)[0]))[:noofclusters] + centroids = tf.Variable(tf.gather(vectors, initial_indices)) + + @tf.function + def train_step(): + # Find the closest centroid for each vector + distances_sq = tf.reduce_sum( + tf.square(tf.expand_dims(vectors, 1) - tf.expand_dims(centroids, 0)), 2 + ) + assignments = tf.argmin(distances_sq, axis=1) + + #Recalculate centroids efficiently + sums = tf.math.unsorted_segment_sum(vectors, assignments, num_segments=noofclusters) + counts = tf.math.unsorted_segment_sum(tf.ones_like(vectors), assignments, num_segments=noofclusters) + + # Avoid division by zero for empty clusters + new_centroids = sums / tf.maximum(counts, 1e-9) + + # For empty clusters, keep the old centroid to prevent them from moving to the origin + is_empty = tf.equal(tf.reduce_sum(counts, axis=1), 0) + new_centroids = tf.where(tf.expand_dims(is_empty, 1), centroids, new_centroids) + + return assignments, new_centroids + + # Main iteration loop + for i in range(max_iterations): + old_centroids = tf.identity(centroids) + assignments, new_centroids_val = train_step() + centroids.assign(new_centroids_val) + + # Check for convergence + if tf.reduce_sum(tf.square(old_centroids - centroids)) < tolerance: + break + + return centroids.numpy(), assignments.numpy() From 7e2baa57f1cb48ced8d4ed2d6bef09ce4c169f6f Mon Sep 17 00:00:00 2001 From: Raisa Jose <168289547+raisaaajose@users.noreply.github.com> Date: Tue, 29 Jul 2025 16:41:16 -0700 Subject: [PATCH 2/4] fix: renamed function --- dynamic_programming/k_means_clustering_tensorflow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py index 43a67503133c..deae982ab879 100644 --- a/dynamic_programming/k_means_clustering_tensorflow.py +++ b/dynamic_programming/k_means_clustering_tensorflow.py @@ -3,7 +3,7 @@ import numpy as np -def tf_k_means_cluster_fixed(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4): +def tf_k_means_clustering(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4): """ Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x @@ -18,19 +18,19 @@ def tf_k_means_cluster_fixed(vectors, noofclusters,max_iterations = 100,toleranc Example 1: >>>data2 = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32) - >>>centroids2, assignments2 = tf_k_means_cluster_fixed(data2, 2) + >>>centroids2, assignments2 = tf_k_means_clustering(data2, 2) >>>print(centroids2,assignments2) [[ 0.05 0.05] [10. 10. ]] [0 0 1] Example 2 (Idential data points): >>>data_identical = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32) - >>>centroids, assignments = tf_k_means_cluster_fixed(data_identical, 1) + >>>centroids, assignments = tf_k_means_clustering(data_identical, 1) >>>print(centroids,assignments) Example 3 (k>N): >>>data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32) - >>>centroids, assignments = tf_k_means_cluster_fixed(data, 5) + >>>centroids, assignments = tf_k_means_clustering(data, 5) >>>print(centroids,assignments) """ From aa55cbac3159abfa5f6354c7b5524401b4faf234 Mon Sep 17 00:00:00 2001 From: Raisa Jose <168289547+raisaaajose@users.noreply.github.com> Date: Tue, 29 Jul 2025 16:46:03 -0700 Subject: [PATCH 3/4] fix: removed doctests for first PR --- .../k_means_clustering_tensorflow.py | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py index deae982ab879..9e4a30759410 100644 --- a/dynamic_programming/k_means_clustering_tensorflow.py +++ b/dynamic_programming/k_means_clustering_tensorflow.py @@ -6,32 +6,6 @@ def tf_k_means_clustering(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4): """ Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x - - Parameters: - vectors (list): A list of vectors. - noofclusters (int): The number of clusters (k). - max_iterations(int): maximum number of iterations or how many times the algorithm will refine its cluster assignments and centroid positions, until convergence. - tolerance(int): defines a convergence criterion. The K-means algorithm stops when the centroids move less than this tolerance value between consecutive iterations. - - (set same random seed in all examples for reproducibility) - >>>tf.random.set_seed(42) - - Example 1: - >>>data2 = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32) - >>>centroids2, assignments2 = tf_k_means_clustering(data2, 2) - >>>print(centroids2,assignments2) - [[ 0.05 0.05] - [10. 10. ]] [0 0 1] - - Example 2 (Idential data points): - >>>data_identical = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32) - >>>centroids, assignments = tf_k_means_clustering(data_identical, 1) - >>>print(centroids,assignments) - - Example 3 (k>N): - >>>data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32) - >>>centroids, assignments = tf_k_means_clustering(data, 5) - >>>print(centroids,assignments) """ From 0f9d2152a098862dbdb9b4f6dc2a6a083cadfbc5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 00:50:52 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../k_means_clustering_tensorflow.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py index 9e4a30759410..d261c3bfb505 100644 --- a/dynamic_programming/k_means_clustering_tensorflow.py +++ b/dynamic_programming/k_means_clustering_tensorflow.py @@ -3,18 +3,19 @@ import numpy as np -def tf_k_means_clustering(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4): +def tf_k_means_clustering(vectors, noofclusters, max_iterations=100, tolerance=1e-4): """ Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x """ - vectors = tf.constant(vectors, dtype=tf.float32) noofclusters = int(noofclusters) num_data_points = tf.shape(vectors)[0] if noofclusters > num_data_points: - raise ValueError("Number of clusters (k) cannot be greater than the number of data points.") + raise ValueError( + "Number of clusters (k) cannot be greater than the number of data points." + ) # Initialize centroids randomly from first k(no: of clusters) elements from the shuffled data points initial_indices = tf.random.shuffle(tf.range(tf.shape(vectors)[0]))[:noofclusters] @@ -28,9 +29,13 @@ def train_step(): ) assignments = tf.argmin(distances_sq, axis=1) - #Recalculate centroids efficiently - sums = tf.math.unsorted_segment_sum(vectors, assignments, num_segments=noofclusters) - counts = tf.math.unsorted_segment_sum(tf.ones_like(vectors), assignments, num_segments=noofclusters) + # Recalculate centroids efficiently + sums = tf.math.unsorted_segment_sum( + vectors, assignments, num_segments=noofclusters + ) + counts = tf.math.unsorted_segment_sum( + tf.ones_like(vectors), assignments, num_segments=noofclusters + ) # Avoid division by zero for empty clusters new_centroids = sums / tf.maximum(counts, 1e-9)