Vous êtes sur la page 1sur 5

cluster australia

May 4, 2016
In [79]: import requests as rq
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import mpld3
mpld3.enable_notebook()
import scipy as sp
import datetime
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import pickle
In [28]: data = pickle.load(open("aussie_dataset.p", "rb"))
labeled_data = pickle.load(open("labeled_aussie_dataset.p", "rb"))
Split dataset: 1. Training data: 50% 2. Hold-out data: 25% 3. Test data: 25%
In [29]: # split data for approach (1)
# raw data
raw_data = data.values
np.random.shuffle(raw_data)
training_data = raw_data[:5000]
holdout_data = raw_data[5000:7500]
test_data = raw_data[7500:10000]
# raw_data.shape[0] == training_data.shape[0] + holdout_data.shape[0] + test_data.shape[0]
In [30]: # pd.DataFrame(training_data)

Strategy:
1. Cluster training data for given value of k.
2. Use clustered data to classify shapes in our hold-out data.
using RandomForestClassifier
3. As we classify, we add each shape to its computed cluster.
4. Once done classifying, we can calculate the sihlouette index.

We repeat this process for the values of k we are interested in while using the sihlouette index as a metric
to figure out how many clusters we should use.
In [31]: from sklearn.cluster import KMeans
def cluster_training(training_data, K):
1

estimator = KMeans(init="k-means++", n_clusters=K, n_init=10)


estimator.fit(training_data)
labels = estimator.labels_
centroids = pd.DataFrame(estimator.cluster_centers_)
clustered_data = pd.DataFrame(training_data.copy())
return {"data": clustered_data, "labels": labels, "centroids": centroids}

In [ ]:
In [33]: from sklearn.ensemble import RandomForestClassifier
def classify_holdout(training_data, training_labels, classify_data):
clf = RandomForestClassifier(max_depth=100, max_features=24)
clf.fit(training_data, training_labels)
classified_labels = clf.predict(classify_data)
classified_data = pd.DataFrame(classify_data.copy())

return {"training_data": training_data, "classified_data": classified_data, "training_label

In [ ]:
In [ ]:
In [ ]:
In [70]: # compute sihlouette index
from sklearn import metrics
# metrics.silhouette_score(combined_data, combined_labels)
LIMIT = 7500
def compute_silhouette(training_data, holdout_data, K):
cluster_results = cluster_training(training_data, K)
cluster_data = cluster_results["data"]
cluster_labels = cluster_results["labels"]
classify_results = classify_holdout(training_data, cluster_labels, holdout_data)
classify_data = classify_results["classified_data"]
classify_labels = classify_results["classified_labels"]
combine_data = [cluster_data, classify_data]
combined_data = pd.concat(combine_data)
combined_labels = np.concatenate((cluster_labels, classify_labels), axis=0)
if combined_data.shape[0] > LIMIT:
print("LIMIT exceeded: {} > {}".format(combined_data.shape[0], LIMIT))
return
silhouette = metrics.silhouette_score(sample_data, sample_labels)
print("Silhouette index for K = {} is {}".format(K, silhouette))
return silhouette
In [56]: compute_silhouette(training_data, holdout_data, 10)
Calculating silhouette for K = 10
Out[56]: 0.088535622740990402
2

In [71]: # compute silhouette for values of K


K_values = list(range(5, 100, 5))
def plot_silhouette_K_values(training_data, holdout_data, K_values):
silhouette_values = {k: 0 for k in K_values}
for K in K_values:
print("Calculating silhouette for K = {}".format(K))
silhouette_K = compute_silhouette(training_data, holdout_data, K)
silhouette_values[K] = silhouette_K
# prepare data for plotting
X = np.array(list(silhouette_values.keys()))
Y = np.array(list(silhouette_values.values()))
plt.plot(X,Y)
plt.xlabel(K-value)
plt.ylabel(Silhouette Index)
plt.title(Silhouette Index vs. K Values)
return X, Y
In [67]: X, Y = plot_silhouette_K_values(training_data, holdout_data, K_values)
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating
Calculating

silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette
silhouette

for
for
for
for
for
for
for
for
for
for
for
for
for
for
for
for
for
for
for

K
K
K
K
K
K
K
K
K
K
K
K
K
K
K
K
K
K
K

=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=

5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95

In [76]: plt.plot(X,Y)
plt.xlabel(K-value)
plt.ylabel(Silhouette Index)
plt.title(Silhouette Index vs. K Values)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

Vous aimerez peut-être aussi