当前位置: 代码迷 >> 综合 >> 机器学习实战之最近邻法分类算法分享交流
  详细解决方案

机器学习实战之最近邻法分类算法分享交流

热度:103   发布时间:2023-09-27 09:31:03.0

整理并分析了里面一些错误和不适之处,原因是Python版本的更新,目前只是基础部分的代码,后面例子的代码调试分析后再上传,供大家学习交流。

#-*-coding:utf-8-*-
from numpy import *
import operator
import matplotlib.pyplot as plt
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    For every row of ``dataSet`` the Euclidean distance to ``inX`` is
    computed, the rows are ordered by increasing distance, and the label
    that occurs most often among the ``k`` closest rows is returned.

    :param inX: the input vector to classify
    :param dataSet: the full matrix of training examples, one per row
    :param labels: a sequence of labels, one per row of ``dataSet``
    :param k: the number of nearest neighbours used in the vote; should be
        a positive integer no larger than the number of rows
    :return: the majority label among the k nearest neighbours
    """
    dataSetsize = dataSet.shape[0]  # number of training rows
    # Euclidean distance: repeat inX once per row, subtract, square,
    # sum along each row (axis=1) and take the square root.
    diffMat = tile(inX, (dataSetsize, 1)) - dataSet
    sqDistance = (diffMat ** 2).sum(axis=1)
    distance = sqDistance ** 0.5
    # argsort yields the row indices ordered from nearest to farthest.
    sortedDistIndicies = distance.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # dict.items() (not the Python 2 iteritems()) feeds sorted(); the sort is
    # stable, so on a tie the label that was seen first (nearest) wins.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]


def file2martix(filename):
    """Parse a tab-separated dating data file into features and labels.

    Each line is expected to hold three numeric features followed by a
    label, e.g. for datingTestSet.txt:

    1. number of frequent flyer miles earned per year
    2. percentage of time spent playing video games
    3. liters of ice cream consumed per week

    :param filename: path of the data file (e.g. ``datingTestSet.txt``)
    :return: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        N x 3 float matrix and ``classLabelVector`` is the list of the last
        field of every line, kept as strings
    """
    # Read the file once and close it deterministically; the line count is
    # needed up front to size the matrix.
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromline = line.strip().split('\t')
        # The first three fields are the features of this row.
        returnMat[index, :] = [float(value) for value in listFromline[0:3]]
        # The last field is the class label.
        classLabelVector.append(listFromline[-1])
    return returnMat, classLabelVector


def plotpicture(dataingMata, classingLabel):
    """Show a scatter plot of column 1 against column 2 of the data matrix.

    :param dataingMata: feature matrix; columns 1 and 2 are plotted
    :param classingLabel: class labels (accepted for interface
        compatibility; not used by the plot)
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataingMata[:, 1], dataingMata[:, 2])
    ax.axis([-2, 25, -0.2, 2.0])
    plt.xlabel('Percentage of Time Spent Playing Video Games')
    plt.ylabel('Listers of Ice Cream Consumed Per Week')
    plt.show()


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    :param dataSet: the data matrix, one example per row
    :return: ``(normDataSet, ranges, minVals)`` -- the normalised matrix,
        the per-column ``max - min`` and the per-column minimum
    """
    # The 0 argument takes the minimum / maximum down each column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column vectors to every row; ``/`` is
    # element-wise division, not a matrix solve.
    # NOTE: a constant column has range 0 and produces nan/inf here.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest(normMat, datingLabels):
    """Hold out the first 10% of the rows and print the kNN error rate.

    The first ``int(m * 0.10)`` rows are classified with k=3 against the
    remaining rows; every prediction and the final error rate are printed.

    :param normMat: normalised feature matrix
    :param datingLabels: labels matching the rows of ``normMat``
    """
    hoRatio = 0.10
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(
            normMat[i, :],
            normMat[numTestVecs:m, :],
            datingLabels[numTestVecs:m],
            3,
        )
        print("the classifier came back with: %s, the real answer is: %s"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))


def classifyPerson(datingLabels, normMat, minVals, ranges):
    """Interactively classify one person described on standard input.

    Prompts for the three features, normalises them with ``minVals`` and
    ``ranges`` and prints the label predicted by ``classify0`` with k=3.

    :param datingLabels: labels matching the rows of ``normMat``
    :param normMat: normalised training matrix
    :param minVals: per-column minimum used for normalisation
    :param ranges: per-column range used for normalisation
    """
    percenTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    # Same column order as the training data: miles, game time, ice cream.
    inArr = array([ffMiles, percenTats, iceCream])
    classifierResult = classify0(
        (inArr - minVals) / ranges, normMat, datingLabels, 3
    )
    # Substitute the label into the message instead of passing it to print()
    # as a second argument, which left the "%s" placeholder in the output.
    print("You will probably like this person with:%s" % (classifierResult,))


#dataMata,classLabelMat=file2martix('datingTestSet.txt')
#normDataSet,ranges,minVals=autoNorm(dataMata)
#datingClassTest(normDataSet,classLabelMat)
#classifyPerson(classLabelMat,normDataSet,minVals,ranges)
#以上测试过,没有问题.

例子:handwriting recognition 的代码,测试过,没有问题。

#-*- coding:utf-8 -*-
from numpy import *
from os import listdir
import operator


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    For every row of ``dataSet`` the Euclidean distance to ``inX`` is
    computed, the rows are ordered by increasing distance, and the label
    that occurs most often among the ``k`` closest rows is returned.

    :param inX: the input vector to classify
    :param dataSet: the full matrix of training examples, one per row
    :param labels: a sequence of labels, one per row of ``dataSet``
    :param k: the number of nearest neighbours used in the vote; should be
        a positive integer no larger than the number of rows
    :return: the majority label among the k nearest neighbours
    """
    dataSetsize = dataSet.shape[0]  # number of training rows
    # Euclidean distance: repeat inX once per row, subtract, square,
    # sum along each row (axis=1) and take the square root.
    diffMat = tile(inX, (dataSetsize, 1)) - dataSet
    sqDistance = (diffMat ** 2).sum(axis=1)
    distance = sqDistance ** 0.5
    # argsort sorts ascending and yields the corresponding row indices.
    sortedDistIndicies = distance.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # operator.itemgetter(1) builds a function that extracts the count from
    # each (label, count) pair; sorted() uses it as the key. The sort is
    # stable, so on a tie the label that was seen first (nearest) wins.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]


def img2vevtor(filename):
    """Read a 32x32 text image of digit characters into a 1 x 1024 vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    :param filename: path of the text image file
    :return: a 1 x 1024 array holding the image values
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def _digit_from_filename(fileNameStr):
    """Return the integer before the first '_' of a name like ``3_12.txt``."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def handwritigClassTest():
    """Evaluate kNN (k=3) on the handwritten digit files.

    Every file in ``trainingDigits`` becomes one training row whose label is
    parsed from its file name; every file in ``testDigits`` is classified
    against that matrix. Each prediction, the number of errors and the error
    rate are printed.
    """
    hwLables = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLables.append(_digit_from_filename(fileNameStr))
        trainingMat[i, :] = img2vevtor('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')
    errcount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digit_from_filename(fileNameStr)
        vectorUnderTest = img2vevtor('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLables, 3)
        print("the classifier came back with: %d,the real answer is %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errcount += 1
    print("\n the total number og errors is: %d" % errcount)
    print("\n thr total error rate is: %f" % (errcount / float(mTest)))


# Run the evaluation only when executed as a script, so that importing this
# module does not require the digit directories to exist.
if __name__ == "__main__":
    handwritigClassTest()

 

  相关解决方案