Vous êtes sur la page 1sur 3

import random

import math

NUM_CLUSTERS = 2
TOTAL_DATA = 7
LOWEST_SAMPLE_POINT = 0 #element 0 of SAMPLES.
HIGHEST_SAMPLE_POINT = 3 #element 3 of SAMPLES.
BIG_NUMBER = math.pow(10, 10)

SAMPLES = [[1.0, 1.0], [1.5, 2.0], [3.0, 4.0], [5.0, 7.0], [3.5, 5.0], [4.5,
5.0], [3.5, 4.5]]

data = []
centroids = []

class DataPoint:
def __init__(self, x, y):
self.x = x
self.y = y

def set_x(self, x):


self.x = x

def get_x(self):
return self.x

def set_y(self, y):


self.y = y

def get_y(self):
return self.y

def set_cluster(self, clusterNumber):


self.clusterNumber = clusterNumber

def get_cluster(self):
return self.clusterNumber

class Centroid:
def __init__(self, x, y):
self.x = x
self.y = y

def set_x(self, x):


self.x = x

def get_x(self):
return self.x

def set_y(self, y):


self.y = y

def get_y(self):
return self.y
def initialize_centroids():
# Set the centoid coordinates to match the data points furthest from each
other.
# In this example, (1.0, 1.0) and (5.0, 7.0)
centroids.append(Centroid(SAMPLES[LOWEST_SAMPLE_POINT][0],
SAMPLES[LOWEST_SAMPLE_POINT][1]))
centroids.append(Centroid(SAMPLES[HIGHEST_SAMPLE_POINT][0],
SAMPLES[HIGHEST_SAMPLE_POINT][1]))

print("Centroids initialized at:")


print("(", centroids[0].get_x(), ", ", centroids[0].get_y(), ")")
print("(", centroids[1].get_x(), ", ", centroids[1].get_y(), ")")
print()
return

def initialize_datapoints():
# DataPoint objects' x and y values are taken from the SAMPLE array.
# The DataPoints associated with LOWEST_SAMPLE_POINT and
HIGHEST_SAMPLE_POINT are initially
# assigned to the clusters matching the LOWEST_SAMPLE_POINT and
HIGHEST_SAMPLE_POINT centroids.
for i in range(TOTAL_DATA):
newPoint = DataPoint(SAMPLES[i][0], SAMPLES[i][1])

if(i == LOWEST_SAMPLE_POINT):
newPoint.set_cluster(0)
elif(i == HIGHEST_SAMPLE_POINT):
newPoint.set_cluster(1)
else:
newPoint.set_cluster(None)

data.append(newPoint)

return

def get_distance(dataPointX, dataPointY, centroidX, centroidY):


# Calculate Euclidean distance.
return math.sqrt(math.pow((centroidY - dataPointY), 2) +
math.pow((centroidX - dataPointX), 2))

def recalculate_centroids():
totalX = 0
totalY = 0
totalInCluster = 0

for j in range(NUM_CLUSTERS):
for k in range(len(data)):
if(data[k].get_cluster() == j):
totalX += data[k].get_x()
totalY += data[k].get_y()
totalInCluster += 1

if(totalInCluster > 0):


centroids[j].set_x(totalX / totalInCluster)
centroids[j].set_y(totalY / totalInCluster)
return

def update_clusters():
isStillMoving = 0

for i in range(TOTAL_DATA):
bestMinimum = BIG_NUMBER
currentCluster = 0

for j in range(NUM_CLUSTERS):
distance = get_distance(data[i].get_x(), data[i].get_y(),
centroids[j].get_x(), centroids[j].get_y())
if(distance < bestMinimum):
bestMinimum = distance
currentCluster = j

data[i].set_cluster(currentCluster)

if(data[i].get_cluster() is None or data[i].get_cluster() !=


currentCluster):
data[i].set_cluster(currentCluster)
isStillMoving = 1

return isStillMoving

def perform_kmeans():
isStillMoving = 1

initialize_centroids()

initialize_datapoints()

while(isStillMoving):
recalculate_centroids()
isStillMoving = update_clusters()

return

def print_results():
for i in range(NUM_CLUSTERS):
print("Cluster ", i, " includes:")
for j in range(TOTAL_DATA):
if(data[j].get_cluster() == i):
print("(", data[j].get_x(), ", ", data[j].get_y(), ")")
print()

return

perform_kmeans()
print_results()

Vous aimerez peut-être aussi