
Deep Learning ("Flower Book") Notes 4 - K-means: Algorithm Flow and Implementation

      • 1. K-means Algorithm Flow
      • 2. K-means Implementation

1. K-means Algorithm Flow

[Figure: K-means algorithm flowchart]
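The flowchart image itself is not reproduced here. As a compact substitute, the sketch below summarizes the standard flow the figure depicts: pick k initial centers, assign every sample to its nearest center, move each center to the mean of its assigned samples, and repeat until no assignment changes. This is a minimal NumPy sketch only; the function name kmeans_sketch and its parameters are illustrative and do not appear in the code of Section 2.

import numpy as np

def kmeans_sketch(points, k, max_iter=100, seed=0):
    """Minimal K-means sketch: returns (centers, labels)."""
    points = np.asarray(points, dtype=float)
    rng = np.random.default_rng(seed)
    # Step 1: pick k distinct samples as the initial centers.
    centers = points[rng.choice(len(points), size=k, replace=False)]
    labels = np.full(len(points), -1)
    for _ in range(max_iter):
        # Step 2: assign every sample to its nearest center (Euclidean distance).
        dists = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2)  # (m, k)
        new_labels = dists.argmin(axis=1)
        if np.array_equal(new_labels, labels):  # Step 4: stop when assignments no longer change.
            break
        labels = new_labels
        # Step 3: move each center to the mean of the samples assigned to it.
        for j in range(k):
            if np.any(labels == j):
                centers[j] = points[labels == j].mean(axis=0)
    return centers, labels

For two well-separated clusters this loop typically converges in a handful of iterations.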

2. K-means Implementation

Kmeans_ is the fast (vectorized) version and Kmeans is the plain version. The fast version replaces the per-sample inner Python loop with a single vectorized distance computation over all samples at once; see the sketch below.
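To make that difference concrete before the full listing, here is a minimal sketch of the vectorization idea: the plain version computes the distance from one sample to one center per loop step, while the fast version computes the distances from all samples to one center in a single broadcasted NumPy operation, which is what the ComputeDistance(centers[j, :], data) call inside Kmeans_ does. The arrays points and center below are illustrative and not part of the original code.

import numpy as np

points = np.random.rand(10000, 2)   # 10000 two-dimensional samples (illustrative data)
center = np.array([0.5, 0.5])

# Plain version: one Python-level pass per sample.
dists_loop = np.array([np.sqrt(np.sum((p - center) ** 2)) for p in points])

# Fast version: one broadcasted operation over all samples at once.
dists_vec = np.sqrt(np.sum((points - center) ** 2, axis=1))

assert np.allclose(dists_loop, dists_vec)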

'''
Created on Apr 11, 2020
Author: yali
'''
import time

import numpy as np
import matplotlib.pyplot as plt


def LoadDataSet(fileName, delim='\t'):
    # Load a delimiter-separated text file into an (m x n) float matrix.
    with open(fileName) as fr:
        stringArr = [line.strip().split(delim) for line in fr.readlines()]
    datArr = [list(map(float, line)) for line in stringArr]
    return np.mat(datArr)


def RandomClusterCenter(data, k):
    # Initialize k cluster centers uniformly at random within the value range of each feature.
    m, n = data.shape
    centers = np.mat(np.zeros((k, n)))
    for j in range(n):
        min_value = np.min(data[:, j])
        max_value = np.max(data[:, j])
        range_value = float(max_value - min_value)
        centers[:, j] = np.mat(min_value + range_value * np.random.rand(k, 1))
    return centers


def ComputeDistance(center, points):
    # Euclidean distance from every row of `points` (m x n) to a single `center` (1 x n).
    m = points.shape[0]
    diff = points - np.tile(center, (m, 1))
    diff = np.multiply(diff, diff)
    return np.sqrt(diff.sum(axis=1))


def Kmeans(data, k):
    m, n = data.shape
    # Column 0 stores the cluster label, column 1 the squared distance to the assigned center.
    cluster = np.mat(np.zeros((m, 2)))
    centers = RandomClusterCenter(data, k)  # initialize the cluster centers
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False
        for i in range(m):
            min_distance = np.inf  # positive infinity
            min_cluster = -1       # cluster label: 0 .. k-1
            for j in range(k):
                # distance from sample i to center j
                distance = float(ComputeDistance(centers[j, :], data[i, :]))
                if distance < min_distance:  # keep the closest center
                    min_distance = distance
                    min_cluster = j
            if cluster[i, 0] != min_cluster:  # a label changed, so another iteration is needed
                cluster_changed = True
            cluster[i, :] = min_cluster, min_distance ** 2
        for cent in range(k):  # recalculate centroids
            cluster_points = data[np.nonzero(cluster[:, 0].A == cent)[0]]  # all samples assigned to cluster `cent`
            if cluster_points.shape[0] > 0:  # guard against empty clusters
                centers[cent, :] = np.mean(cluster_points, axis=0)  # new center = mean of its samples
    return centers, cluster


def Kmeans_(data, k):
    m, n = data.shape
    # Column 0 stores the cluster label, column 1 the squared distance to the assigned center.
    cluster = np.mat(np.zeros((m, 2)))
    centers = RandomClusterCenter(data, k)  # initialize the cluster centers
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False
        # Reset the per-sample nearest distance and label at the start of every pass.
        min_distance = np.mat(np.full((m, 1), np.inf))
        min_cluster = np.mat(np.full((m, 1), -1.0))
        for j in range(k):
            # distances from all m samples to center j in one vectorized call
            distance = ComputeDistance(centers[j, :], data)
            closer = np.where(distance.A < min_distance.A)[0]  # samples for which center j is the nearest so far
            min_distance[closer] = distance[closer]
            min_cluster[closer] = j
        if np.sum(np.abs(cluster[:, 0] - min_cluster)) != 0:  # some label changed, so iterate again
            cluster_changed = True
        cluster[:, 0] = min_cluster
        cluster[:, 1] = np.multiply(min_distance, min_distance)
        for cent in range(k):  # recalculate centroids
            cluster_points = data[np.nonzero(cluster[:, 0].A == cent)[0]]  # all samples assigned to cluster `cent`
            if cluster_points.shape[0] > 0:  # guard against empty clusters
                centers[cent, :] = np.mean(cluster_points, axis=0)  # new center = mean of its samples
    return centers, cluster


def DisplayData(data):
    # Scatter plot of the samples, styled by the 0/1 label stored in column 2 of the data file.
    figure = plt.figure()
    ax = figure.add_subplot(111)
    for i in range(len(data)):
        if data[i, 2] == 0:
            ax.scatter(data[i, 0], data[i, 1], marker='o', s=80, c='green')
        if data[i, 2] == 1:
            ax.scatter(data[i, 0], data[i, 1], marker='^', s=80, c='red')


if __name__ == '__main__':
    data = LoadDataSet('data.txt')

    start_time = time.perf_counter()
    centers, clusterAssment = Kmeans_(data[:, 0:2], 2)  # fast (vectorized) version
    end_time = time.perf_counter()
    print("Kmeans_ Execution Time: {:.8f}s".format(end_time - start_time))

    start_time = time.perf_counter()
    centers, clusterAssment = Kmeans(data[:, 0:2], 2)   # plain version
    end_time = time.perf_counter()
    print("Kmeans Execution Time: {:.8f}s".format(end_time - start_time))

    DisplayData(data)
    plt.plot(centers[:, 0], centers[:, 1], '+', color='black', markersize=40)
    plt.show()

The test results are shown below:
[Figure: test output - execution times and the resulting cluster plot]
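The data.txt file used for the test is not attached to the note. Assuming it is a tab-separated file with three columns (x, y, and a 0/1 label), as LoadDataSet and DisplayData imply, the following sketch generates a comparable file from two synthetic Gaussian blobs so the script above can be run end to end. The helper name make_test_file and the blob parameters are illustrative assumptions, not part of the original note.

import numpy as np

def make_test_file(fileName='data.txt', n_per_cluster=100, seed=42):
    # Two 2-D Gaussian blobs plus a 0/1 label column, tab-separated,
    # matching the (x, y, label) layout that LoadDataSet and DisplayData expect.
    rng = np.random.default_rng(seed)
    blob0 = rng.normal(loc=[-2.0, -2.0], scale=0.8, size=(n_per_cluster, 2))
    blob1 = rng.normal(loc=[2.0, 2.0], scale=0.8, size=(n_per_cluster, 2))
    rows = np.vstack([
        np.hstack([blob0, np.zeros((n_per_cluster, 1))]),
        np.hstack([blob1, np.ones((n_per_cluster, 1))]),
    ])
    np.savetxt(fileName, rows, delimiter='\t', fmt='%.6f')

if __name__ == '__main__':
    make_test_file()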
