# Page 1 of 2 (pagination text, apparently left over from the document viewer; not part of the script).

import math
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Load the purchase records. The clustering in this script works on the
# User_ID and Purchase columns of this table.
data = pd.read_csv("BlackFriday.csv", sep=",", engine="python")

# Rows with a missing value in any column are left out of the clustering.
finaltable = data.dropna()

k = 2           # number of clusters
centroids = {}  # cluster index -> (User_ID, Purchase) centroid
l = []          # one (initially empty) member list per cluster

# Bounds for the random starting centroids, computed once rather than on
# every loop pass. Series.min()/max() replace the `pd.np` alias, which no
# longer exists in pandas 2.x.
user_id_min = data['User_ID'].min()
user_id_max = data['User_ID'].max()
purchase_min = data['Purchase'].min()
purchase_max = data['Purchase'].max()

for i in range(k):
    # Draw order (User_ID first, then Purchase) is kept so that a seeded
    # `random` yields the same starting centroids as before.
    centroids[i] = (
        int(random.uniform(user_id_min, user_id_max)),
        int(random.uniform(purchase_min, purchase_max)),
    )
    l.append([])

# Range of input for the algorithm: summary figures over the full table.
# Series.min()/max() are used because the `pd.np` alias was removed from
# pandas, so `.agg(pd.np.min)` raises AttributeError on current versions.
numcostumers = data['User_ID'].nunique()  # distinct User_ID values
numpurchases = data['User_ID'].count()    # non-null User_ID entries
minpurchase = data['Purchase'].min()
maxpurchase = data['Purchase'].max()
meanpurchase = data['Purchase'].mean()
sd = data['Purchase'].std()

# Sizes of the two clusters; overwritten once the clustering has converged.
finalsizec1 = 0
finalsizec2 = 0

# Show the randomly chosen starting centroids.
print(centroids[0])
print(centroids[1])

# k-means over the (User_ID, Purchase) points.
#
# Reads:  finaltable (rows to cluster), centroids (starting positions), k.
# Writes: centroids (final positions, as tuples of floats), change,
#         finalsizec1 / finalsizec2 (sizes of clusters 0 and 1),
#         and the scatter plot "kmeans.pdf".
#
# Only the rows of `finaltable` take part. The coordinates are copied into a
# float array once so that each iteration is a vectorized NumPy computation
# rather than a per-row Python loop with pandas scalar lookups.
points = finaltable[['User_ID', 'Purchase']].to_numpy(dtype=float)

change = True
while change:
    centers = np.array([centroids[z] for z in range(k)], dtype=float)

    # Assignment step: squared Euclidean distance from every point to every
    # centroid, shape (n_points, k). argmin returns the first minimum, so
    # ties go to the lower cluster index.
    sq_dist = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = sq_dist.argmin(axis=1)

    # Update step: move each centroid to the mean of its members. The loop
    # repeats while ANY centroid moved; looking at the last centroid alone
    # can stop the algorithm before the others have settled.
    change = False
    for z in range(k):
        members = points[labels == z]
        if len(members) == 0:
            # An empty cluster has no mean; leave its centroid in place
            # instead of dividing by zero.
            continue
        new_centroid = (float(members[:, 0].mean()),
                        float(members[:, 1].mean()))
        if new_centroid != tuple(centroids[z]):
            centroids[z] = new_centroid
            change = True

# `labels` now holds the assignment for the converged centroids.
# minlength guarantees both size slots exist even if a cluster is empty.
cluster_sizes = np.bincount(labels, minlength=max(k, 2))
finalsizec1 = int(cluster_sizes[0])
finalsizec2 = int(cluster_sizes[1])

# One scatter call per cluster so each gets its own colour.
for z in range(k):
    members = points[labels == z]
    plt.scatter(members[:, 0], members[:, 1])

# Save before showing: once show() returns, the figure may already have been
# released, and savefig would then write an empty page.
plt.savefig("kmeans.pdf")
plt.show()

# Write the run summary to Output.txt.
# The encoding is explicit because the text contains a non-ASCII character
# ("Nº"); without it the bytes written depend on the platform default.
with open("Output.txt", "w", encoding="utf-8") as text_file:
    text_file.write(f"Total costumers: {numcostumers} \n")
    text_file.write(f"Nº purchases: {numpurchases} \n")
    text_file.write(f"min purchase: {minpurchase}\n")
    text_file.write(f"max purchase: {maxpurchase}\n")
    text_file.write(f"standard deviation purchases: {sd}\n")
    text_file.write(f"Output results \nNumber of clusters: {k}\n")
    text_file.write(f"NElements on first cluster: {finalsizec1}\n")
    text_file.write(f"NElements on second cluster: {finalsizec2}\n")
    # Report the centroid coordinates here, not the cluster sizes again.
    # Each coordinate is converted with float() so the text does not depend
    # on how NumPy scalars are rendered inside a tuple.
    text_file.write(
        f"First centroid: ({float(centroids[0][0])}, {float(centroids[0][1])})\n"
    )
    text_file.write(
        f"Second centroid: ({float(centroids[1][0])}, {float(centroids[1][1])})\n"
    )