
fix: K-means update for Tensorflow 2 #12865

Closed
wants to merge 4 commits into from
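This PR replaces the TensorFlow 1.x graph/session implementation (per-vector placeholders, tf.assign, and sess.run calls) with an eager TensorFlow 2.x version that computes every point-to-centroid distance in a single broadcasted op. A minimal sketch of that core idea, using hypothetical shapes and random sample data not taken from the PR:

import tensorflow as tf

# Hypothetical sample data: 10 points of dimensionality 2, and 3 centroids.
vectors = tf.random.uniform([10, 2])
centroids = tf.random.uniform([3, 2])

# Broadcasting (n, 1, k) - (1, c, k) -> (n, c, k); summing over the last axis
# gives the squared Euclidean distance from every point to every centroid,
# replacing the per-vector sess.run loop of the old implementation.
distances_sq = tf.reduce_sum(
    tf.square(tf.expand_dims(vectors, 1) - tf.expand_dims(centroids, 0)), 2
)
assignments = tf.argmin(distances_sq, axis=1)  # closest centroid index per point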
189 changes: 51 additions & 138 deletions dynamic_programming/k_means_clustering_tensorflow.py
@@ -1,146 +1,59 @@
from random import shuffle

Check failure (GitHub Actions / ruff): dynamic_programming/k_means_clustering_tensorflow.py:1:20: F401 `random.shuffle` imported but unused

import tensorflow as tf
from numpy import array
import numpy as np

Check failure (GitHub Actions / ruff): dynamic_programming/k_means_clustering_tensorflow.py:3:17: F401 `numpy` imported but unused
Check failure (GitHub Actions / ruff): dynamic_programming/k_means_clustering_tensorflow.py:1:1: I001 Import block is un-sorted or un-formatted


def tf_k_means_cluster(vectors, noofclusters):
def tf_k_means_clustering(vectors, noofclusters, max_iterations=100, tolerance=1e-4):
"""
K-Means Clustering using TensorFlow.
'vectors' should be a n*k 2-D NumPy array, where n is the number
of vectors of dimensionality k.
'noofclusters' should be an integer.
Performs K-means clustering using a fixed and efficient vectorized approach, using Tensorflow 2.x

Check failure (GitHub Actions / ruff): dynamic_programming/k_means_clustering_tensorflow.py:8:89: E501 Line too long (101 > 88)
"""

vectors = tf.constant(vectors, dtype=tf.float32)
noofclusters = int(noofclusters)
assert noofclusters < len(vectors)

# Find out the dimensionality
dim = len(vectors[0])

# Will help select random centroids from among the available vectors
vector_indices = list(range(len(vectors)))
shuffle(vector_indices)

# GRAPH OF COMPUTATION
# We initialize a new graph and set it as the default during each run
# of this algorithm. This ensures that as this function is called
# multiple times, the default graph doesn't keep getting crowded with
# unused ops and Variables from previous function calls.

graph = tf.Graph()

with graph.as_default():
# SESSION OF COMPUTATION

sess = tf.Session()

##CONSTRUCTING THE ELEMENTS OF COMPUTATION

##First lets ensure we have a Variable vector for each centroid,
##initialized to one of the vectors from the available data points
centroids = [
tf.Variable(vectors[vector_indices[i]]) for i in range(noofclusters)
]
##These nodes will assign the centroid Variables the appropriate
##values
centroid_value = tf.placeholder("float64", [dim])
cent_assigns = []
for centroid in centroids:
cent_assigns.append(tf.assign(centroid, centroid_value))

##Variables for cluster assignments of individual vectors(initialized
##to 0 at first)
assignments = [tf.Variable(0) for i in range(len(vectors))]
##These nodes will assign an assignment Variable the appropriate
##value
assignment_value = tf.placeholder("int32")
cluster_assigns = []
for assignment in assignments:
cluster_assigns.append(tf.assign(assignment, assignment_value))

##Now lets construct the node that will compute the mean
# The placeholder for the input
mean_input = tf.placeholder("float", [None, dim])
# The Node/op takes the input and computes a mean along the 0th
# dimension, i.e. the list of input vectors
mean_op = tf.reduce_mean(mean_input, 0)

##Node for computing Euclidean distances
# Placeholders for input
v1 = tf.placeholder("float", [dim])
v2 = tf.placeholder("float", [dim])
euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.sub(v1, v2), 2)))

##This node will figure out which cluster to assign a vector to,
##based on Euclidean distances of the vector from the centroids.
# Placeholder for input
centroid_distances = tf.placeholder("float", [noofclusters])
cluster_assignment = tf.argmin(centroid_distances, 0)

##INITIALIZING STATE VARIABLES

##This will help initialization of all Variables defined with respect
##to the graph. The Variable-initializer should be defined after
##all the Variables have been constructed, so that each of them
##will be included in the initialization.
init_op = tf.initialize_all_variables()

# Initialize all variables
sess.run(init_op)

##CLUSTERING ITERATIONS

# Now perform the Expectation-Maximization steps of K-Means clustering
# iterations. To keep things simple, we will only do a set number of
# iterations, instead of using a Stopping Criterion.
noofiterations = 100
for _ in range(noofiterations):
##EXPECTATION STEP
##Based on the centroid locations till last iteration, compute
##the _expected_ centroid assignments.
# Iterate over each vector
for vector_n in range(len(vectors)):
vect = vectors[vector_n]
# Compute Euclidean distance between this vector and each
# centroid. Remember that this list cannot be named
#'centroid_distances', since that is the input to the
# cluster assignment node.
distances = [
sess.run(euclid_dist, feed_dict={v1: vect, v2: sess.run(centroid)})
for centroid in centroids
]
# Now use the cluster assignment node, with the distances
# as the input
assignment = sess.run(
cluster_assignment, feed_dict={centroid_distances: distances}
)
# Now assign the value to the appropriate state variable
sess.run(
cluster_assigns[vector_n], feed_dict={assignment_value: assignment}
)

##MAXIMIZATION STEP
# Based on the expected state computed from the Expectation Step,
# compute the locations of the centroids so as to maximize the
# overall objective of minimizing within-cluster Sum-of-Squares
for cluster_n in range(noofclusters):
# Collect all the vectors assigned to this cluster
assigned_vects = [
vectors[i]
for i in range(len(vectors))
if sess.run(assignments[i]) == cluster_n
]
# Compute new centroid ___location
new_location = sess.run(
mean_op, feed_dict={mean_input: array(assigned_vects)}
)
# Assign value to appropriate variable
sess.run(
cent_assigns[cluster_n], feed_dict={centroid_value: new_location}
)

# Return centroids and assignments
centroids = sess.run(centroids)
assignments = sess.run(assignments)
return centroids, assignments
num_data_points = tf.shape(vectors)[0]

if noofclusters > num_data_points:
raise ValueError(
"Number of clusters (k) cannot be greater than the number of data points."
)

# Initialize centroids randomly from first k(no: of clusters) elements from the shuffled data points

Check failure (GitHub Actions / ruff): dynamic_programming/k_means_clustering_tensorflow.py:20:89: E501 Line too long (104 > 88)
initial_indices = tf.random.shuffle(tf.range(tf.shape(vectors)[0]))[:noofclusters]
centroids = tf.Variable(tf.gather(vectors, initial_indices))

@tf.function
def train_step():
# Find the closest centroid for each vector
distances_sq = tf.reduce_sum(
tf.square(tf.expand_dims(vectors, 1) - tf.expand_dims(centroids, 0)), 2
)
assignments = tf.argmin(distances_sq, axis=1)

# Recalculate centroids efficiently
sums = tf.math.unsorted_segment_sum(
vectors, assignments, num_segments=noofclusters
)
counts = tf.math.unsorted_segment_sum(
tf.ones_like(vectors), assignments, num_segments=noofclusters
)

# Avoid division by zero for empty clusters
new_centroids = sums / tf.maximum(counts, 1e-9)

# For empty clusters, keep the old centroid to prevent them from moving to the origin

Check failure (GitHub Actions / ruff): dynamic_programming/k_means_clustering_tensorflow.py:43:89: E501 Line too long (93 > 88)
is_empty = tf.equal(tf.reduce_sum(counts, axis=1), 0)
new_centroids = tf.where(tf.expand_dims(is_empty, 1), centroids, new_centroids)

return assignments, new_centroids

# Main iteration loop
for i in range(max_iterations):

Check failure (GitHub Actions / ruff): dynamic_programming/k_means_clustering_tensorflow.py:50:9: B007 Loop control variable `i` not used within loop body
old_centroids = tf.identity(centroids)
assignments, new_centroids_val = train_step()
centroids.assign(new_centroids_val)

# Check for convergence
if tf.reduce_sum(tf.square(old_centroids - centroids)) < tolerance:
break

return centroids.numpy(), assignments.numpy()
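For reference, a hypothetical usage sketch of the updated function (the sample data and parameter values are not part of the PR, and it assumes the repository root is on the Python path):

import numpy as np

from dynamic_programming.k_means_clustering_tensorflow import tf_k_means_clustering

# Hypothetical data: 100 random 2-D points clustered into 3 groups.
data = np.random.rand(100, 2).astype(np.float32)
centroids, assignments = tf_k_means_clustering(data, 3, max_iterations=50)

print(centroids.shape)    # (3, 2): one centroid per cluster
print(assignments.shape)  # (100,): cluster index for each input vector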