Sklearn 聚类分析 kmeans,DBSCAN_综合

KMenas算法

#导入KMeans模块
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

#随机生成含150个类别数为3的数据集
X,y=make_blobs(n_samples=150,centers=3,random_state=8)
#数据可视化
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(dpi=100)
plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.spring,edgecolors='k')

<matplotlib.collections.PathCollection at 0x24dbd2a2dd8>

在这里插入图片描述

#模块实例化
kmeans=KMeans(n_clusters=3)

#模型训练
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',random_state=None, tol=0.0001, verbose=0)

import numpy as np
from matplotlib.colors import ListedColormap
#定义图像中分区的颜色和散点的颜色
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
#分别用样本的两个特征值创建图像的横轴和纵轴
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),np.arange(y_min, y_max, .02))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
#给每个分类中的样本分配不同的颜色
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
#用散点把样本表示出来
plt.scatter(X[:, 0], X[:, 1], c=y, cmap = plt.cm.spring, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("K-Means Cluster")

在这里插入图片描述

Text(0.5, 1.0, 'K-Means Cluster')

#查看聚类中心点的坐标
kmeans.cluster_centers_

array([[ 7.51338019,  9.44881625],[-5.43790266, -9.83963795],[ 7.21711781,  0.68887741]])

#将数据中心点可视化
import numpy as np
from matplotlib.colors import ListedColormap
#定义图像中分区的颜色和散点的颜色
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
#分别用样本的两个特征值创建图像的横轴和纵轴
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),np.arange(y_min, y_max, .02))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
#给每个分类中的样本分配不同的颜色
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
#用散点把样本表示出来
plt.scatter(X[:, 0], X[:, 1], c=y, cmap = plt.cm.spring, edgecolor='k', s=20)
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],s=200,marker='*',c='red',label='centroids')
plt.legend()
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("K-Means Cluster")

在这里插入图片描述

Text(0.5, 1.0, 'K-Means Cluster')

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vki9Yroa-1645627108907)(output_7_2.png)]

#查看数据标签
kmeans.labels_

array([0, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0,2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 2, 0, 2, 0, 1, 1, 2, 2, 1, 1, 1,2, 2, 0, 0, 0, 1, 2, 2, 1, 1, 0, 2, 1, 2, 1, 0, 0, 2, 2, 0, 0, 2,0, 2, 2, 0, 0, 0, 0, 1, 0, 2, 2, 1, 0, 0, 0, 1, 1, 1, 2, 2, 1, 0,1, 2, 2, 1, 1, 2, 0, 1, 0, 1, 2, 2, 0, 0, 0, 2, 1, 1, 2, 1, 0, 2,1, 0, 0, 1, 2, 0, 1, 2, 2, 1, 1, 1, 1, 0, 0, 0, 1, 2, 2, 2, 0, 1,0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 2, 1, 1, 2, 2, 1, 1, 2])

#查看运行时迭代的次数
kmeans.n_iter_

#查看各样本与其最近的类中心的距离之和，簇内误差平方和
kmeans.inertia_

329.18213511898057

#肘方法，确定k值
distortion=[]
for i in range(1,20):km=KMeans(n_clusters=i,init='k-means++',random_state=8)km.fit(X)distortion.append(km.inertia_)

plt.plot(range(1, 20), distortion, marker='o')
plt.xticks(range(1, 20))
plt.grid(linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')

Text(0, 0.5, 'Distortion')

在这里插入图片描述

#计算并绘制轮廓系数
km=KMeans(n_clusters=2)
y_km=km.fit_predict(X)
from matplotlib import cm
from sklearn.metrics import silhouette_samples
cluster_labels=np.unique(y_km)
n_clusters=cluster_labels.shape[0]
silhouette_vals=silhouette_samples(X,y_km,metric='euclidean')

y_ax_lower,y_ax_upper= 0,0
yticks = []
for i ,c in enumerate(cluster_labels):c_silhouette_vals = silhouette_vals[y_km == c]c_silhouette_vals.sort()y_ax_upper += len(c_silhouette_vals)color = cm. jet(i/n_clusters)plt.barh(range(y_ax_lower, y_ax_upper),c_silhouette_vals,height=1.0,edgecolor='none',color=color)yticks.append((y_ax_lower+y_ax_upper)/2)y_ax_lower += len(c_silhouette_vals)silhouette_avg = np.mean(silhouette_vals)plt.axvline(silhouette_avg,color='red',linestyle='--')
plt.ylabel("Cluster")
plt.xlabel("silhouette coefficient")

Text(0.5, 0, 'silhouette coefficient')

在这里插入图片描述

DBSCAN算法

from sklearn.cluster import DBSCAN

#生成模拟数据集
from sklearn.datasets import make_moons
X,y=make_moons(n_samples=200,noise=0.05,random_state=8)

#数据可视化
plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.spring,edgecolors='k')

<matplotlib.collections.PathCollection at 0x24dfc0ddd68>

在这里插入图片描述

#利用K-Means算法聚类
#模型实例化
kmeans=KMeans(n_clusters=2,init='random')

#模型训练与预测
y_km=kmeans.fit_predict(X)

#将数据中心点可视化
import numpy as np
from matplotlib.colors import ListedColormap
#定义图像中分区的颜色和散点的颜色
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
#分别用样本的两个特征值创建图像的横轴和纵轴
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),np.arange(y_min, y_max, .02))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
#给每个分类中的样本分配不同的颜色
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
#用散点把样本表示出来
plt.scatter(X[:, 0], X[:, 1], c=y, cmap = plt.cm.spring, edgecolor='k', s=20)
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],s=200,marker='*',c='red',label='centroids')
plt.legend()
plt.grid()
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("K-Means Cluster")

在这里插入图片描述

Text(0.5, 1.0, 'K-Means Cluster')

#结果可视化
plt.scatter(X[y_km==0,0],X[y_km==0,1],c='red',marker='o',s=40,label='cluster 1',edgecolor='k')
plt.scatter(X[y_km==1,0],X[y_km==1,1],c='green',marker='s',s=40,label='cluster 2',edgecolor='k')
plt.title('K-means clustering')
plt.legend()

<matplotlib.legend.Legend at 0x24dfc0a1c88>

在这里插入图片描述

#利用DBSCAN聚类
#导入DBSCAN模块
from sklearn.cluster import DBSCAN
#模型实例化
db=DBSCAN(eps=0.2,min_samples=5)
#训练模型并预测
y_db=db.fit_predict(X)

#DBSCAN结果可视化
plt.scatter(X[y_km==0,0],X[y_km==0,1],c='red',marker='o',s=40,label='cluster 1',edgecolor='k')
plt.scatter(X[y_km==1,0],X[y_km==1,1],c='green',marker='s',s=40,label='cluster 2',edgecolor='k')
plt.title('DBSCAN clustering')
plt.legend()

<matplotlib.legend.Legend at 0x24dfc5b6320>

! 在这里插入图片描述

%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
dbscan_data=pd.read_csv(r'C:\Users\86139\Desktop\dbscan_data.txt')
dbscan_data.head()
plt.scatter(dbscan_data['x1'],dbscan_data['x2'])

<matplotlib.collections.PathCollection at 0x24dfe0f40b8>

import numpy as np
from sklearn.cluster import DBSCAN
db=DBSCAN(eps=0.1,min_samples=10)
db.fit(dbscan_data)

DBSCAN(algorithm='auto', eps=0.1, leaf_size=30, metric='euclidean',metric_params=None, min_samples=10, n_jobs=None, p=None)

db.labels_

array([0, 0, 0, ..., 2, 1, 1], dtype=int64)

labels6=db.labels_
dbscan_data['cluster_db']=labels6
colors=np.array(['red','green','blue','yellow','teal','orange','cyan','black','goldenrod','tomato'])
plt.figure(figsize=(10,8))
plt.scatter(dbscan_data['x1'],dbscan_data['x2'],c=colors[dbscan_data['cluster_db']])

<matplotlib.collections.PathCollection at 0x24dfe148400>

在这里插入图片描述

层次聚类

seeds_df=pd.read_csv(r'C:\Users\86139\Desktop\seeds-less-rows.csv')
seeds_df.head()

	area	perimeter	compactness	length	width	asymmetry_coefficient	groove_length	grain_variety
0	14.88	14.57	0.8811	5.554	3.333	1.018	4.956	Kama wheat
1	14.69	14.49	0.8799	5.563	3.259	3.586	5.219	Kama wheat
2	14.03	14.16	0.8796	5.438	3.201	1.717	5.001	Kama wheat
3	13.99	13.83	0.9183	5.119	3.383	5.234	4.781	Kama wheat
4	14.11	14.26	0.8722	5.520	3.168	2.688	5.219	Kama wheat

seeds_df.grain_variety.value_counts()

Kama wheat        14
Rosa wheat        14
Canadian wheat    14
Name: grain_variety, dtype: int64

varieties=list(seeds_df.pop('grain_variety'))
samples=seeds_df.values

samples

array([[14.88  , 14.57  ,  0.8811,  5.554 ,  3.333 ,  1.018 ,  4.956 ],[14.69  , 14.49  ,  0.8799,  5.563 ,  3.259 ,  3.586 ,  5.219 ],[14.03  , 14.16  ,  0.8796,  5.438 ,  3.201 ,  1.717 ,  5.001 ],[13.99  , 13.83  ,  0.9183,  5.119 ,  3.383 ,  5.234 ,  4.781 ],[14.11  , 14.26  ,  0.8722,  5.52  ,  3.168 ,  2.688 ,  5.219 ],[13.02  , 13.76  ,  0.8641,  5.395 ,  3.026 ,  3.373 ,  4.825 ],[15.49  , 14.94  ,  0.8724,  5.757 ,  3.371 ,  3.412 ,  5.228 ],[16.2   , 15.27  ,  0.8734,  5.826 ,  3.464 ,  2.823 ,  5.527 ],[13.5   , 13.85  ,  0.8852,  5.351 ,  3.158 ,  2.249 ,  5.176 ],[15.36  , 14.76  ,  0.8861,  5.701 ,  3.393 ,  1.367 ,  5.132 ],[15.78  , 14.91  ,  0.8923,  5.674 ,  3.434 ,  5.593 ,  5.136 ],[14.46  , 14.35  ,  0.8818,  5.388 ,  3.377 ,  2.802 ,  5.044 ],[11.23  , 12.63  ,  0.884 ,  4.902 ,  2.879 ,  2.269 ,  4.703 ],[14.34  , 14.37  ,  0.8726,  5.63  ,  3.19  ,  1.313 ,  5.15  ],[16.84  , 15.67  ,  0.8623,  5.998 ,  3.484 ,  4.675 ,  5.877 ],[17.32  , 15.91  ,  0.8599,  6.064 ,  3.403 ,  3.824 ,  5.922 ],[18.72  , 16.19  ,  0.8977,  6.006 ,  3.857 ,  5.324 ,  5.879 ],[18.88  , 16.26  ,  0.8969,  6.084 ,  3.764 ,  1.649 ,  6.109 ],[18.76  , 16.2   ,  0.8984,  6.172 ,  3.796 ,  3.12  ,  6.053 ],[19.31  , 16.59  ,  0.8815,  6.341 ,  3.81  ,  3.477 ,  6.238 ],[17.99  , 15.86  ,  0.8992,  5.89  ,  3.694 ,  2.068 ,  5.837 ],[18.85  , 16.17  ,  0.9056,  6.152 ,  3.806 ,  2.843 ,  6.2   ],[19.38  , 16.72  ,  0.8716,  6.303 ,  3.791 ,  3.678 ,  5.965 ],[18.96  , 16.2   ,  0.9077,  6.051 ,  3.897 ,  4.334 ,  5.75  ],[18.14  , 16.12  ,  0.8772,  6.059 ,  3.563 ,  3.619 ,  6.011 ],[18.65  , 16.41  ,  0.8698,  6.285 ,  3.594 ,  4.391 ,  6.102 ],[18.94  , 16.32  ,  0.8942,  6.144 ,  3.825 ,  2.908 ,  5.949 ],[17.36  , 15.76  ,  0.8785,  6.145 ,  3.574 ,  3.526 ,  5.971 ],[13.32  , 13.94  ,  0.8613,  5.541 ,  3.073 ,  7.035 ,  5.44  ],[11.43  , 13.13  ,  0.8335,  5.176 ,  2.719 ,  2.221 ,  5.132 ],[12.01  , 13.52  ,  0.8249,  5.405 ,  2.776 ,  6.992 ,  5.27  ],[11.34  , 12.87  ,  0.8596,  5.053 ,  2.849 ,  3.347 ,  5.003 ],[12.02  , 13.33  ,  0.8503,  5.35  ,  2.81  ,  4.271 ,  5.308 ],[12.44  , 13.59  ,  0.8462,  5.319 ,  2.897 ,  4.924 ,  5.27  ],[11.55  , 13.1   ,  0.8455,  5.167 ,  2.845 ,  6.715 ,  4.956 ],[11.26  , 13.01  ,  0.8355,  5.186 ,  2.71  ,  5.335 ,  5.092 ],[12.46  , 13.41  ,  0.8706,  5.236 ,  3.017 ,  4.987 ,  5.147 ],[11.81  , 13.45  ,  0.8198,  5.413 ,  2.716 ,  4.898 ,  5.352 ],[11.27  , 12.86  ,  0.8563,  5.091 ,  2.804 ,  3.985 ,  5.001 ],[12.79  , 13.53  ,  0.8786,  5.224 ,  3.054 ,  5.483 ,  4.958 ],[12.67  , 13.32  ,  0.8977,  4.984 ,  3.135 ,  2.3   ,  4.745 ],[11.23  , 12.88  ,  0.8511,  5.14  ,  2.795 ,  4.325 ,  5.003 ]])

%matplotlib inline
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
#进行层次聚类
mergings = linkage(samples, method='complete')

#树状图结果
plt.figure(figsize=(20,8))
dendrogram(mergings,labels=varieties,leaf_rotation=50,leaf_font_size=13,orientation='top')
plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

在这里插入图片描述

#聚类模型评估


from scipy.cluster.hierarchy import fcluster
labels=fcluster(mergings,6,criterion='distance')
df=pd.DataFrame({
    'labels':labels,'varirties':varieties})
ct=pd.crosstab(df['labels'],df['varirties'])

from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(df['labels'],df['varirties'])

0.800622446994748

from sklearn.metrics import mutual_info_score
mutual_info_score(df['labels'],df['varirties'])

0.9099935418029258

#轮廓系数
from sklearn import metrics
score=metrics.silhouette_score(X,kmeans.labels_)
print(score)

0.4935017224074542