import pandas as pd
import numpy as np
# raw strings avoid '\U...' escape errors in Windows paths
ratings_data = pd.read_csv(r'C:\Users\DELL\Anaconda2\sar.csv')
ratings_data.head()
movie_names = pd.read_csv(r"C:\Users\DELL\Anaconda2\movies.csv")
movie_names.head()
# join ratings with titles so each rating row carries its movie title
movie_data = pd.merge(ratings_data, movie_names, on='movieId')
movie_data.head()
movie_data.groupby('title')['rating'].mean().head()
movie_data.groupby('title')['rating'].mean().sort_values(ascending=False).head()
movie_data.groupby('title')['rating'].count().sort_values(ascending=False).head()
ratings_mean_count = pd.DataFrame(movie_data.groupby('title')['rating'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(movie_data.groupby('title')['rating'].count())
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')
plt.figure(figsize=(8,6))
plt.rcParams['patch.force_edgecolor'] = True
ratings_mean_count['rating_counts'].hist(bins=50)
plt.figure(figsize=(8,6))
plt.rcParams['patch.force_edgecolor'] = True
# scatter of each title's average rating against its number of ratings
sns.jointplot(x='rating', y='rating_counts', data=ratings_mean_count,
              alpha=0.4)
# one row per user, one column per title, cells hold that user's rating
user_movie_rating = movie_data.pivot_table(index='userId', columns='title',
                                           values='rating')
user_movie_rating.head()
# single column of ratings for the reference movie (title string assumed from movies.csv)
forrest_gump_ratings = user_movie_rating['Forrest Gump (1994)']
forrest_gump_ratings.head()
movies_like_forrest_gump = user_movie_rating.corrwith(forrest_gump_ratings)
corr_forrest_gump = pd.DataFrame(movies_like_forrest_gump, columns=['Correlation'])
corr_forrest_gump.dropna(inplace=True)
corr_forrest_gump.head()
corr_forrest_gump.sort_values('Correlation', ascending=False).head(10)
corr_forrest_gump = corr_forrest_gump.join(ratings_mean_count['rating_counts'])
corr_forrest_gump.head()
corr_forrest_gump[corr_forrest_gump['rating_counts'] > 50].sort_values('Correlation', ascending=False).head()
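The same lookup generalizes to any title. A minimal sketch wrapping the steps above into one helper (the function name and the 50-rating cutoff are illustrative choices, not from the original):

def similar_movies(title, min_ratings=50):
    # correlate every title's rating column with the reference title's column
    corr = pd.DataFrame(user_movie_rating.corrwith(user_movie_rating[title]),
                        columns=['Correlation'])
    corr.dropna(inplace=True)
    corr = corr.join(ratings_mean_count['rating_counts'])
    # keep titles with enough ratings for the correlation to mean anything
    return corr[corr['rating_counts'] > min_ratings].sort_values('Correlation', ascending=False)

similar_movies('Forrest Gump (1994)').head()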
Page Rank
import numpy as np
a = np.array([[0,0,0,0,0,0,0,0,0,0,0,0],[0,1,0,0,0,0,0,0,0,0,0,0],[0,0,0,1/2,1/2,0,0,0,0,0,0,0],
[0,0,0,0,1,0,0,0,0,0,0,0],[0,0,0,0,0,1,0,0,0,1,0,0],
[0,0,0,0,0,0,1,0,0,0,0,0],[0,0,0,0,0,0,0,2/3,2/3,0,0,2/3],
[0,0,0,0,0,0,0,0,1,0,0,0],[0,0,0,0,0,0,0,0,0,2/3,2/3,2/3],
[0,1,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,1,0,0,0,0,1],
[0,0,0,0,0,0,0,0,0,0,0,0]])
n = 15
d = 0.85
d_matrix = np.array([[d]]*a.shape[1])
a_trans = a.T
for i in range(n):
    a1 = np.matmul(a_trans, d_matrix)
    d_matrix = a1
print(d_matrix)
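This loop is plain power iteration seeded with the damping constant; it leaves out the (1-d)/N teleportation term of the standard damped update PR = (1-d)/N + d*M*PR. A sketch of the full update, assuming the link matrix should first be normalized column-wise (the normalization step is an assumption, not in the original):

def pagerank(M, d=0.85, n_iter=15):
    # M[i][j] = 1/outdegree(j) if page j links to page i, else 0
    N = M.shape[0]
    rank = np.full((N, 1), 1.0 / N)        # start from the uniform distribution
    for _ in range(n_iter):
        rank = (1 - d) / N + d * M @ rank  # damped PageRank update
    return rank

col_sums = a.T.sum(axis=0)
M = np.divide(a.T, col_sums, out=np.zeros_like(a.T, dtype=float), where=col_sums != 0)
print(pagerank(M))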
Page Rank: Example 2
import numpy as np
a = np.array([[0,0,0.5,0,0,0,0,0.5,0,0],[0.5,0,0,0,0,0,0.5,0,0,0],[0,0,0,1,0,0,0,0,0,0],
[0,0,0,0,0,1,0,0,0,0],[0.5,0,0,0,0,0,0,0,0.5,0],
[0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,1],
[0,0,0,0,0,0,0,0,1,0],
[0,0,0,0,0,1,0,0,0,0],[0,0,0,0,0,0,0,1,0,0]])
n = 15
d = 0.85
d_matrix = np.array([[d]]*a.shape[1])
a_trans = a.T
for i in range(n):
    a1 = np.matmul(a_trans, d_matrix)
    d_matrix = a1
print(d_matrix)
k-means
import pandas as pd
import pylab as pl
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn import preprocessing
variables = pd.read_csv(r'C:\Users\DELL\Anaconda2\sar.csv')
Y = variables[['movieId']]
X = variables[['rating']]
StartJunction = variables[['timestamp']]
EndJunction = variables[['userId']]
# fit k-means for k = 1..19 and record each model's score for the elbow curve
Nc = range(1, 20)
kmeans = [KMeans(n_clusters=i) for i in Nc]
score = [kmeans[i].fit(Y).score(Y) for i in range(len(kmeans))]
pl.plot(Nc, score)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()
# scale the columns before PCA and the final fit (Y_norm/X_norm were never
# defined above; standard scaling is assumed)
Y_norm = preprocessing.scale(Y)
X_norm = preprocessing.scale(X)
pca = PCA(n_components=1).fit(Y_norm)
pca_d = pca.transform(Y_norm)
pca_c = pca.transform(X_norm)
kmeans = KMeans(n_clusters=3)
kmeansoutput = kmeans.fit(Y_norm)
kmeansoutput
# plot the one-component projections colored by cluster label
pl.scatter(pca_c[:, 0], pca_d[:, 0], c=kmeansoutput.labels_)
pl.xlabel('Rating')
pl.ylabel('UserId')
pl.show()
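The elbow curve is read by eye; the silhouette score gives a numeric cross-check on the choice of k. A minimal sketch (running it on Y_norm is an assumption, to match the fit above):

from sklearn.metrics import silhouette_score
for k in range(2, 8):
    labels = KMeans(n_clusters=k).fit_predict(Y_norm)
    # closer to 1 means tighter, better-separated clusters
    print(k, silhouette_score(Y_norm, labels))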
Mean Shift and Agglomerative Clustering
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs
style.use("ggplot")
# sample 3-D blobs around three centers (X was undefined in the original;
# make_blobs is an assumed stand-in for the missing data step)
centers = [[1,1,1],[5,5,5],[3,10,10]]
X, _ = make_blobs(n_samples=150, centers=centers, cluster_std=1.5)
ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
print(cluster_centers)
n_clusters_ = len(np.unique(labels))
print("Number of estimated clusters:", n_clusters_)
colors = 10*['r','g','b','c','k','y','m']
print(colors)
print(labels)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# plot each sample colored by its cluster, then mark the centers with black crosses
for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')
ax.scatter(cluster_centers[:,0], cluster_centers[:,1], cluster_centers[:,2],
           marker='x', color='k', s=150, linewidths=5, zorder=10)
plt.show()
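The code above runs mean shift; for the agglomerative half of this section, a minimal sketch with scikit-learn's AgglomerativeClustering on the same X (the linkage and cluster count are illustrative choices):

from sklearn.cluster import AgglomerativeClustering
# bottom-up merging with Ward linkage, cut at 3 clusters
agg = AgglomerativeClustering(n_clusters=3, linkage='ward')
agg_labels = agg.fit_predict(X)
print("Agglomerative labels:", agg_labels)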
Divisive Clustering
import pandas as pd
import numpy as np
num_clusters = 0
mat = np.array([[0,2,6,10,9],[2,0,5,9,8],[6,5,0,4,5],[10,9,4,0,3],[9,8,5,3,0]])
all_elements = ['a','b','c','d','e']
# label the matrix so it can be indexed by element name
dissimilarity_matrix = pd.DataFrame(mat, index=all_elements, columns=all_elements)

def avg_dissim_within_group_element(ele, element_list):
    max_diameter = -np.inf
    sum_dissm = 0
    for i in element_list:
        sum_dissm += dissimilarity_matrix[ele][i]
        if dissimilarity_matrix[ele][i] > max_diameter:
            max_diameter = dissimilarity_matrix[ele][i]
    if(len(element_list)>1):
        avg = sum_dissm/(len(element_list)-1)
    else:
        avg = 0
    return avg

def avg_dissim_across_group_element(ele, main_list, splinter_list):
    if len(splinter_list) == 0:
        return 0
    sum_dissm = 0
    for j in splinter_list:
        sum_dissm += dissimilarity_matrix[ele][j]
    avg = sum_dissm/(len(splinter_list))
    return avg

def splinter(main_list, splinter_group):
    most_dissm_object_value = -np.inf
    most_dissm_object_index = None
    for ele in main_list:
        # the element whose within-group dissimilarity most exceeds its
        # dissimilarity to the splinter group moves out next
        x = avg_dissim_within_group_element(ele, main_list)
        y = avg_dissim_across_group_element(ele, main_list, splinter_group)
        diff = x - y
        if diff > most_dissm_object_value:
            most_dissm_object_value = diff
            most_dissm_object_index = ele
    if(most_dissm_object_value>0):
        return (most_dissm_object_index, 1)
    else:
        return (-1, -1)

def split(element_list):
    main_list = element_list
    splinter_group = []
    (most_dissm_object_index, flag) = splinter(main_list, splinter_group)
    while(flag > 0):
        main_list.remove(most_dissm_object_index)
        splinter_group.append(most_dissm_object_index)
        (most_dissm_object_index, flag) = splinter(main_list, splinter_group)
    return (main_list, splinter_group)

def max_diameter(cluster_list):
    max_diameter_cluster_index = None
    max_diameter_cluster_value = -np.inf
    index = 0
    for element_list in cluster_list:
        for i in element_list:
            for j in element_list:
                if dissimilarity_matrix[i][j] > max_diameter_cluster_value:
                    max_diameter_cluster_value = dissimilarity_matrix[i][j]
                    max_diameter_cluster_index = index
        index +=1
    if(max_diameter_cluster_value <= 0):
        return -1
    return max_diameter_cluster_index

# start with every element in one cluster and keep splitting the cluster
# with the largest diameter until no cluster has any internal dissimilarity
current_clusters = [all_elements]
level = 1
index = 0
while(index!=-1):
    print(level, current_clusters)
    (a_clstr, b_clstr) = split(current_clusters[index])
    del current_clusters[index]
    current_clusters.append(a_clstr)
    current_clusters.append(b_clstr)
    index = max_diameter(current_clusters)
    level +=1
print(level, current_clusters)
import pandas as pd
import numpy as np
# the original omitted the read step; the file name here is an assumption
sale2 = pd.read_csv(r'C:\Users\DELL\Anaconda2\sales.csv')
sale2.info()
sale2
sale2.isnull().sum()
# fill each numeric column's missing values with that column's own mean
mean = sale2['Sales'].mean()
mean
sale2.isnull().sum()
sale2['Sales'] = sale2['Sales'].fillna(mean)
sale2.info()
sale2.isnull().sum()
sale2['profit'] = sale2['profit'].fillna(sale2['profit'].mean())
sale2.isnull().sum()
sale2
Preprocessing
from sklearn import preprocessing
# z-score standardization of the two numeric columns (the column pairing is assumed)
std_scale = preprocessing.StandardScaler().fit(sale2[['Sales', 'Order Quantity']])
sale2_std = std_scale.transform(sale2[['Sales', 'Order Quantity']])
print('Mean after standardization: Sales={:.2f}, Order Quantity={:.2f}'
      .format(sale2_std[:,0].mean(), sale2_std[:,1].mean()))
print('Std after standardization: Sales={:.2f}, Order Quantity={:.2f}'
      .format(sale2_std[:,0].std(), sale2_std[:,1].std()))
# min-max scaling to [0, 1]
minmax_scale = preprocessing.MinMaxScaler().fit(sale2[['Sales', 'Order Quantity']])
sale2_minmax = minmax_scale.transform(sale2[['Sales', 'Order Quantity']])
print('Max after min-max scaling: Sales={:.2f}, Order Quantity={:.2f}'
      .format(sale2_minmax[:,0].max(), sale2_minmax[:,1].max()))
Decimal Scaling
# quick check: scale the Sales mean by a fixed power of ten
dec = sale2['Sales'].mean()
sar = dec / 10000
sar
# decimal scaling of the full columns with fixed divisors
dec = sale2['Sales']
dec1 = sale2['Order Quantity']
sar = dec / 10000
sar1 = dec1 / 100
sar1
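The divisors 10000 and 100 are hard-coded; decimal scaling proper divides by the smallest power of ten that brings every absolute value below 1. A sketch (the helper name is illustrative):

def decimal_scale(col):
    # smallest j such that max(|v|) / 10**j < 1
    j = int(np.floor(np.log10(col.abs().max()))) + 1
    return col / (10 ** j)

sale2['Sales_scaled'] = decimal_scale(sale2['Sales'])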
Apriori
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
# example transactions (assumed; the original's dataset was not shown)
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]
# one-hot encode the transactions into a boolean DataFrame
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
df
from mlxtend.frequent_patterns import apriori
apriori(df, min_support=0.6)
apriori(df, min_support=0.6, use_colnames=True)
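Frequent itemsets alone don't say which implications are strong; mlxtend's association_rules derives them. A follow-on sketch (the 0.7 confidence threshold is an arbitrary choice):

from mlxtend.frequent_patterns import association_rules
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
# keep rules whose confidence P(consequent | antecedent) is at least 0.7
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])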
FP-Growth (FP-tree construction)
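createTree below expects an FP-tree node class in the style of Machine Learning in Action; a minimal sketch of that class:

class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue      # item this node represents
        self.count = numOccur      # transactions passing through this node
        self.nodeLink = None       # next node holding the same item
        self.parent = parentNode
        self.children = {}
    def inc(self, numOccur):
        self.count += numOccur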
def createTree(dataSet, minSup=1):  # create FP-tree from dataset but don't mine
    headerTable = {}
    for trans in dataSet:           # first pass: count item frequencies
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    for k in list(headerTable):     # drop items below min support
        if headerTable[k] < minSup:
            del(headerTable[k])
    freqItemSet = set(headerTable.keys())
    if len(freqItemSet) == 0: return None, None  # if no items meet min support -->get out
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # count plus head of node-link chain
    retTree = treeNode('Null Set', 1, None)
    for tranSet, count in dataSet.items():       # second pass: insert each transaction
        localD = {}
        for item in tranSet:
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable

def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:  # item already a child: bump its count
        inTree.children[items[0]].inc(count)
    else:
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] is None:
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:               # recurse on the remainder of the transaction
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)

def updateHeader(nodeToTest, targetNode):
    while (nodeToTest.nodeLink != None):  # Do not use recursion to traverse a linked list!
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode
def loadSimpDat():
    # toy transactions; rows other than ['z'] assumed from Machine Learning in Action
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat

def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1  # each transaction counted once
    return retDict
simpDat = loadSimpDat()
simpDat
initSet = createInitSet(simpDat)
initSet
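With the helpers in place, the tree is built from initSet; a minimum support of 3 (the value used in Machine Learning in Action's example) is assumed here:

myFPtree, myHeaderTab = createTree(initSet, 3)
# items that survived the support threshold
print(sorted(myHeaderTab.keys()))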
Decision Tree
import pandas
from io import StringIO
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from IPython.display import Image
import pydotplus

# url and names (the CSV's location and column list) were not shown in the original
dataset = pandas.read_csv(url, names=names)
# fit a tree before exporting it (the split and feature_cols definition are
# assumed; the original showed only the export step)
feature_cols = names[:-1]
X_train, X_test, y_train, y_test = train_test_split(
    dataset[feature_cols], dataset[names[-1]], test_size=0.3, random_state=1)
clf = DecisionTreeClassifier().fit(X_train, y_train)

# render the fitted tree to a PNG via graphviz
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
Image(graph.create_png())
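A natural follow-up is to score the tree on the held-out split; a minimal sketch using scikit-learn's metrics module:

from sklearn import metrics
y_pred = clf.predict(X_test)
# fraction of held-out rows the tree classifies correctly
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))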