From b78dfe3c237769ced9957b8a49713f8c5e4ec614 Mon Sep 17 00:00:00 2001
From: Raisa Jose <168289547+raisaaajose@users.noreply.github.com>
Date: Tue, 29 Jul 2025 16:34:14 -0700
Subject: [PATCH 1/4] feat: Update for TensorFlow 2, add doctests

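Rewrite the K-means clustering implementation for TensorFlow 2.x
compatibility: the TF1 graph/session code is replaced by a vectorized
step wrapped in `tf.function`.

The function is renamed to `tf_k_means_cluster_fixed` and gains
`max_iterations` and `tolerance` parameters, so iteration stops early
once the centroids move less than the tolerance. The
`assert noofclusters < len(vectors)` check becomes a ValueError that is
raised when the number of clusters exceeds the number of data points.
Clusters that end up empty keep their previous centroid.

Usage examples are added to the docstring for clarity and testing. The
K-means algorithm itself (assign each vector to its nearest centroid,
then recompute centroids as cluster means) is unchanged.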
---
 .../k_means_clustering_tensorflow.py          | 210 ++++++------------
 1 file changed, 72 insertions(+), 138 deletions(-)

diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py
index 8d3f6f0dfbcb..43a67503133c 100644
--- a/dynamic_programming/k_means_clustering_tensorflow.py
+++ b/dynamic_programming/k_means_clustering_tensorflow.py
@@ -1,146 +1,80 @@
 from random import shuffle
-
 import tensorflow as tf
-from numpy import array
+import numpy as np
 
 
-def tf_k_means_cluster(vectors, noofclusters):
+def tf_k_means_cluster_fixed(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4):
     """
-    K-Means Clustering using TensorFlow.
-    'vectors' should be a n*k 2-D NumPy array, where n is the number
-    of vectors of dimensionality k.
-    'noofclusters' should be an integer.
+    Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x
+
+    Parameters:
+    vectors (list): A list of vectors.
+    noofclusters (int): The number of clusters (k).
+    max_iterations(int): maximum number of iterations or how many times the algorithm will refine its cluster assignments and centroid positions, until convergence.
+    tolerance(int): defines a convergence criterion. The K-means algorithm stops when the centroids move less than this tolerance value between consecutive iterations.
+
+    (set same random seed in all examples for reproducibility)
+    >>>tf.random.set_seed(42)
+
+    Example 1:
+    >>>data2 = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32)
+    >>>centroids2, assignments2 = tf_k_means_cluster_fixed(data2, 2)
+    >>>print(centroids2,assignments2)
+    [[ 0.05  0.05]
+    [10.   10.  ]] [0 0 1]
+
+    Example 2 (Idential data points):
+    >>>data_identical = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
+    >>>centroids, assignments = tf_k_means_cluster_fixed(data_identical, 1)
+    >>>print(centroids,assignments)
+
+    Example 3 (k>N):
+    >>>data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32)
+    >>>centroids, assignments = tf_k_means_cluster_fixed(data, 5)
+    >>>print(centroids,assignments)
     """
 
+
+    vectors = tf.constant(vectors, dtype=tf.float32)
     noofclusters = int(noofclusters)
-    assert noofclusters < len(vectors)
-
-    # Find out the dimensionality
-    dim = len(vectors[0])
-
-    # Will help select random centroids from among the available vectors
-    vector_indices = list(range(len(vectors)))
-    shuffle(vector_indices)
-
-    # GRAPH OF COMPUTATION
-    # We initialize a new graph and set it as the default during each run
-    # of this algorithm. This ensures that as this function is called
-    # multiple times, the default graph doesn't keep getting crowded with
-    # unused ops and Variables from previous function calls.
-
-    graph = tf.Graph()
-
-    with graph.as_default():
-        # SESSION OF COMPUTATION
-
-        sess = tf.Session()
-
-        ##CONSTRUCTING THE ELEMENTS OF COMPUTATION
-
-        ##First lets ensure we have a Variable vector for each centroid,
-        ##initialized to one of the vectors from the available data points
-        centroids = [
-            tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters)
-        ]
-        ##These nodes will assign the centroid Variables the appropriate
-        ##values
-        centroid_value = tf.placeholder("float64", [dim])
-        cent_assigns = []
-        for centroid in centroids:
-            cent_assigns.append(tf.assign(centroid, centroid_value))
-
-        ##Variables for cluster assignments of individual vectors(initialized
-        ##to 0 at first)
-        assignments = [tf.Variable(0) for i in range(len(vectors))]
-        ##These nodes will assign an assignment Variable the appropriate
-        ##value
-        assignment_value = tf.placeholder("int32")
-        cluster_assigns = []
-        for assignment in assignments:
-            cluster_assigns.append(tf.assign(assignment, assignment_value))
-
-        ##Now lets construct the node that will compute the mean
-        # The placeholder for the input
-        mean_input = tf.placeholder("float", [None, dim])
-        # The Node/op takes the input and computes a mean along the 0th
-        # dimension, i.e. the list of input vectors
-        mean_op = tf.reduce_mean(mean_input, 0)
-
-        ##Node for computing Euclidean distances
-        # Placeholders for input
-        v1 = tf.placeholder("float", [dim])
-        v2 = tf.placeholder("float", [dim])
-        euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(v1, v2), 2)))
-
-        ##This node will figure out which cluster to assign a vector to,
-        ##based on Euclidean distances of the vector from the centroids.
-        # Placeholder for input
-        centroid_distances = tf.placeholder("float", [noofclusters])
-        cluster_assignment = tf.argmin(centroid_distances, 0)
-
-        ##INITIALIZING STATE VARIABLES
-
-        ##This will help initialization of all Variables defined with respect
-        ##to the graph. The Variable-initializer should be defined after
-        ##all the Variables have been constructed, so that each of them
-        ##will be included in the initialization.
-        init_op = tf.initialize_all_variables()
-
-        # Initialize all variables
-        sess.run(init_op)
-
-        ##CLUSTERING ITERATIONS
-
-        # Now perform the Expectation-Maximization steps of K-Means clustering
-        # iterations. To keep things simple, we will only do a set number of
-        # iterations, instead of using a Stopping Criterion.
-        noofiterations = 100
-        for _ in range(noofiterations):
-            ##EXPECTATION STEP
-            ##Based on the centroid locations till last iteration, compute
-            ##the _expected_ centroid assignments.
-            # Iterate over each vector
-            for vector_n in range(len(vectors)):
-                vect = vectors[vector_n]
-                # Compute Euclidean distance between this vector and each
-                # centroid. Remember that this list cannot be named
-                #'centroid_distances', since that is the input to the
-                # cluster assignment node.
-                distances = [
-                    sess.run(euclid_dist, feed_dict={v1: vect, v2: sess.run(centroid)})
-                    for centroid in centroids
-                ]
-                # Now use the cluster assignment node, with the distances
-                # as the input
-                assignment = sess.run(
-                    cluster_assignment, feed_dict={centroid_distances: distances}
-                )
-                # Now assign the value to the appropriate state variable
-                sess.run(
-                    cluster_assigns[vector_n], feed_dict={assignment_value: assignment}
-                )
-
-            ##MAXIMIZATION STEP
-            # Based on the expected state computed from the Expectation Step,
-            # compute the locations of the centroids so as to maximize the
-            # overall objective of minimizing within-cluster Sum-of-Squares
-            for cluster_n in range(noofclusters):
-                # Collect all the vectors assigned to this cluster
-                assigned_vects = [
-                    vectors[i]
-                    for i in range(len(vectors))
-                    if sess.run(assignments[i]) == cluster_n
-                ]
-                # Compute new centroid location
-                new_location = sess.run(
-                    mean_op, feed_dict={mean_input: array(assigned_vects)}
-                )
-                # Assign value to appropriate variable
-                sess.run(
-                    cent_assigns[cluster_n], feed_dict={centroid_value: new_location}
-                )
-
-        # Return centroids and assignments
-        centroids = sess.run(centroids)
-        assignments = sess.run(assignments)
-        return centroids, assignments
+    num_data_points = tf.shape(vectors)[0]
+
+    if noofclusters > num_data_points:
+      raise ValueError("Number of clusters (k) cannot be greater than the number of data points.")
+
+    # Initialize centroids randomly from first k(no: of clusters) elements from the shuffled data points
+    initial_indices = tf.random.shuffle(tf.range(tf.shape(vectors)[0]))[:noofclusters]
+    centroids = tf.Variable(tf.gather(vectors, initial_indices))
+
+    @tf.function
+    def train_step():
+        # Find the closest centroid for each vector
+        distances_sq = tf.reduce_sum(
+            tf.square(tf.expand_dims(vectors, 1) - tf.expand_dims(centroids, 0)), 2
+        )
+        assignments = tf.argmin(distances_sq, axis=1)
+
+        #Recalculate centroids efficiently
+        sums = tf.math.unsorted_segment_sum(vectors, assignments, num_segments=noofclusters)
+        counts = tf.math.unsorted_segment_sum(tf.ones_like(vectors), assignments, num_segments=noofclusters)
+
+        # Avoid division by zero for empty clusters
+        new_centroids = sums / tf.maximum(counts, 1e-9)
+
+        # For empty clusters, keep the old centroid to prevent them from moving to the origin
+        is_empty = tf.equal(tf.reduce_sum(counts, axis=1), 0)
+        new_centroids = tf.where(tf.expand_dims(is_empty, 1), centroids, new_centroids)
+
+        return assignments, new_centroids
+
+    # Main iteration loop
+    for i in range(max_iterations):
+        old_centroids = tf.identity(centroids)
+        assignments, new_centroids_val = train_step()
+        centroids.assign(new_centroids_val)
+
+        # Check for convergence
+        if tf.reduce_sum(tf.square(old_centroids - centroids)) < tolerance:
+            break
+
+    return centroids.numpy(), assignments.numpy()

From 7e2baa57f1cb48ced8d4ed2d6bef09ce4c169f6f Mon Sep 17 00:00:00 2001
From: Raisa Jose <168289547+raisaaajose@users.noreply.github.com>
Date: Tue, 29 Jul 2025 16:41:16 -0700
Subject: [PATCH 2/4] fix: renamed function

---
 dynamic_programming/k_means_clustering_tensorflow.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py
index 43a67503133c..deae982ab879 100644
--- a/dynamic_programming/k_means_clustering_tensorflow.py
+++ b/dynamic_programming/k_means_clustering_tensorflow.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 
-def tf_k_means_cluster_fixed(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4):
+def tf_k_means_clustering(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4):
     """
     Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x
 
@@ -18,19 +18,19 @@ def tf_k_means_cluster_fixed(vectors, noofclusters,max_iterations = 100,toleranc
 
     Example 1:
     >>>data2 = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32)
-    >>>centroids2, assignments2 = tf_k_means_cluster_fixed(data2, 2)
+    >>>centroids2, assignments2 = tf_k_means_clustering(data2, 2)
     >>>print(centroids2,assignments2)
     [[ 0.05  0.05]
     [10.   10.  ]] [0 0 1]
 
     Example 2 (Idential data points):
     >>>data_identical = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
-    >>>centroids, assignments = tf_k_means_cluster_fixed(data_identical, 1)
+    >>>centroids, assignments = tf_k_means_clustering(data_identical, 1)
     >>>print(centroids,assignments)
 
     Example 3 (k>N):
     >>>data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32)
-    >>>centroids, assignments = tf_k_means_cluster_fixed(data, 5)
+    >>>centroids, assignments = tf_k_means_clustering(data, 5)
     >>>print(centroids,assignments)
     """
 

From aa55cbac3159abfa5f6354c7b5524401b4faf234 Mon Sep 17 00:00:00 2001
From: Raisa Jose <168289547+raisaaajose@users.noreply.github.com>
Date: Tue, 29 Jul 2025 16:46:03 -0700
Subject: [PATCH 3/4] fix: removed doctests for first PR

---
 .../k_means_clustering_tensorflow.py          | 26 -------------------
 1 file changed, 26 deletions(-)

diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py
index deae982ab879..9e4a30759410 100644
--- a/dynamic_programming/k_means_clustering_tensorflow.py
+++ b/dynamic_programming/k_means_clustering_tensorflow.py
@@ -6,32 +6,6 @@
 def tf_k_means_clustering(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4):
     """
     Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x
-
-    Parameters:
-    vectors (list): A list of vectors.
-    noofclusters (int): The number of clusters (k).
-    max_iterations(int): maximum number of iterations or how many times the algorithm will refine its cluster assignments and centroid positions, until convergence.
-    tolerance(int): defines a convergence criterion. The K-means algorithm stops when the centroids move less than this tolerance value between consecutive iterations.
-
-    (set same random seed in all examples for reproducibility)
-    >>>tf.random.set_seed(42)
-
-    Example 1:
-    >>>data2 = np.array([[0.0, 0.0], [0.1, 0.1], [10.0, 10.0]], dtype=np.float32)
-    >>>centroids2, assignments2 = tf_k_means_clustering(data2, 2)
-    >>>print(centroids2,assignments2)
-    [[ 0.05  0.05]
-    [10.   10.  ]] [0 0 1]
-
-    Example 2 (Idential data points):
-    >>>data_identical = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
-    >>>centroids, assignments = tf_k_means_clustering(data_identical, 1)
-    >>>print(centroids,assignments)
-
-    Example 3 (k>N):
-    >>>data = np.array([[0.0, 0.0], [0.9, 0.9], [13.0, 15.0]], dtype=np.float32)
-    >>>centroids, assignments = tf_k_means_clustering(data, 5)
-    >>>print(centroids,assignments)
     """
 
 

From 0f9d2152a098862dbdb9b4f6dc2a6a083cadfbc5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 30 Jul 2025 00:50:52 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../k_means_clustering_tensorflow.py            | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/dynamic_programming/k_means_clustering_tensorflow.py b/dynamic_programming/k_means_clustering_tensorflow.py
index 9e4a30759410..d261c3bfb505 100644
--- a/dynamic_programming/k_means_clustering_tensorflow.py
+++ b/dynamic_programming/k_means_clustering_tensorflow.py
@@ -3,18 +3,19 @@
 import numpy as np
 
 
-def tf_k_means_clustering(vectors, noofclusters,max_iterations = 100,tolerance = 1e-4):
+def tf_k_means_clustering(vectors, noofclusters, max_iterations=100, tolerance=1e-4):
     """
     Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x
     """
 
-
     vectors = tf.constant(vectors, dtype=tf.float32)
     noofclusters = int(noofclusters)
     num_data_points = tf.shape(vectors)[0]
 
     if noofclusters > num_data_points:
-      raise ValueError("Number of clusters (k) cannot be greater than the number of data points.")
+        raise ValueError(
+            "Number of clusters (k) cannot be greater than the number of data points."
+        )
 
     # Initialize centroids randomly from first k(no: of clusters) elements from the shuffled data points
     initial_indices = tf.random.shuffle(tf.range(tf.shape(vectors)[0]))[:noofclusters]
@@ -28,9 +29,13 @@ def train_step():
         )
         assignments = tf.argmin(distances_sq, axis=1)
 
-        #Recalculate centroids efficiently
-        sums = tf.math.unsorted_segment_sum(vectors, assignments, num_segments=noofclusters)
-        counts = tf.math.unsorted_segment_sum(tf.ones_like(vectors), assignments, num_segments=noofclusters)
+        # Recalculate centroids efficiently
+        sums = tf.math.unsorted_segment_sum(
+            vectors, assignments, num_segments=noofclusters
+        )
+        counts = tf.math.unsorted_segment_sum(
+            tf.ones_like(vectors), assignments, num_segments=noofclusters
+        )
 
         # Avoid division by zero for empty clusters
         new_centroids = sums / tf.maximum(counts, 1e-9)