当前位置: 代码迷 >> C# >> <Machine Learning in Action >之二 朴素贝叶斯 C#实现
  详细解决方案

<Machine Learning in Action >之二 朴素贝叶斯 C#实现

热度:507   发布时间:2016-05-05 04:32:23.0
<Machine Learning in Action >之二 朴素贝叶斯 C#实现
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model on word-presence vectors.

    trainMatrix   -- sequence of equal-length document vectors (one per document)
    trainCategory -- sequence of labels; a label equal to 1 selects class 1,
                     anything else is counted as class 0

    Returns (p0Vect, p1Vect, pAbusive): the per-word log-likelihood vectors of
    class 0 and class 1, and the fraction of documents labelled 1.
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    prior_class1 = sum(trainCategory) / float(doc_count)

    # Smoothing: every word count starts at 1 and every denominator at 2.0, so
    # a word never seen in a class does not produce log(0).
    word_counts = [ones(vocab_size), ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for idx, doc_vec in enumerate(trainMatrix):
        cls = 1 if trainCategory[idx] == 1 else 0
        word_counts[cls] += doc_vec
        word_totals[cls] += sum(doc_vec)

    # Work in log space so that classification can add instead of multiply.
    log_likelihood_1 = log(word_counts[1] / word_totals[1])
    log_likelihood_0 = log(word_counts[0] / word_totals[0])
    return log_likelihood_0, log_likelihood_1, prior_class1

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if the class-1 log score is strictly greater, otherwise 0.

    vec2Classify -- word vector of the document to classify
    p0Vec, p1Vec -- per-word log-likelihood vectors of class 0 and class 1
    pClass1      -- prior probability of class 1
    """
    def log_score(log_likelihoods, prior):
        # Element-wise product keeps only the terms of the words that occur;
        # adding log(prior) is the log of "likelihood * prior" (see "提示一").
        return sum(vec2Classify * log_likelihoods) + log(prior)

    score_1 = log_score(p1Vec, pClass1)
    score_0 = log_score(p0Vec, 1.0 - pClass1)
    return 1 if score_1 > score_0 else 0


*提示一

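贝叶斯公式:$p(C_i \mid w) = \dfrac{p(w \mid C_i)\,p(C_i)}{p(w)}$。分母 $p(w)$ 对所有类别都相同,比较各类别时可以略去;再对分子取自然对数(对数单调递增,不改变大小关系),乘积就变成求和:$\ln\bigl(p(w \mid C_i)\,p(C_i)\bigr) = \ln p(w \mid C_i) + \ln p(C_i)$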

在下面的例子中,每个分类在训练样本中所占的比例相同,各类别的 log(p(Ci)) 因而相等,所以省略这一项也不会影响最后的分类结果


用C#随便做个例子,实现文章类型的分类。随机选取的词不如有针对性的词来得有效,所以这里的词汇都是从全部三个分类的文章里挑选出来的

1、创建词向量:中超/亚冠/国足/足协/英超/西甲/欧冠/意甲/德甲/篮球/NBA/CBA/高尔夫/乒乓/排球/网球/羽毛球/跑步/赛车/棋牌/台球/游泳/马术/拳击/田径/功夫/扑克/体育/球队/球员/训练/国家队/联赛/俱乐部/场地/翻盘/绝杀/热身/队友/冠军/亚军/季军/犯规/赛季/加时/反超/半场/争夺/战术/阵容/比赛/德比/恢复/进球/失球/奥斯卡/娱乐/影迷/电影/电视/音乐/戏剧/视频/演员/导演/明星/经纪人/歌手/连续剧/展映/粉丝/写真/演技/作秀/节目/艺人/超模/女星/模特/男星/性感/主创/院线/影业/拍摄/编剧/情节/影像/剧情/主演/上映/票房/开机/剧集/表演/收视/预告片/主持人/艾美奖/角色/剧院/乐迷/影迷/演出/专辑/乐坛/剧场/文艺/芭蕾/戏曲/舞蹈/军事/军队/军机/炸弹/军方/坦克/军舰/炸死/军演/战备/部队/军区/国防/士兵/舰船/潜艇/飞机/直升机/舰队/保卫/演习/武器/反击/打击/阅兵/对抗/防卫/海军/空军/陆军/武装/战略/空袭/冲突/装甲/步兵/作战/导弹/边防/侦察/战斗机/雷达/轰炸/防御/据点/火力/航空母舰/进攻/弹药/军营/包围/攻占/俘虏/参战/战友/战斗/入侵


2、搜狐上下载三类文章各10篇组成训练样本,计算出每篇文章的文档矩阵,标注每篇文章的类别标签          

样本文件名格式:  编号_类别标签.txt

文档矩阵:

000000000000000000100000000000000000001100010001001010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000010000000000000100000000000000011110001010000000000000000011000000110000000000100000000000000000010000000000000000000000000000000000000000000000
000000000000000000000000000011000000000000000000000001001000001001000000001000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000001001000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000010000000000000000000000000000010010000100000000000000010010000001000000000100000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000010000000010000010000010100000000111111111110000000100000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000110000000000000011010000001000010000000000001100001110000000000000000000000000000000000000000000000000000000000000000000000000000
000000000100000000000000000000000000000000000000000000001010000110000000000000000100000001101000000100000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000010000010000000000000001000000001100000100000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000001010000110000000000000000000001011000010000110000000000000000000000000000000000000000000000000000000000000000000
000000010000000000000000000011100000001000010110001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000
000000000000000000000000000000000000000000000000000000001001000100000000000000000000000010000100000100000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011110000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000110000000111111111111100000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000100000000000000000000000000000000000
000000000000000000000000000000000000000000000001000000000001000000000000000000000000100000000000000000000000000100100000010010000000000000000100000000000100000000000010
000000000000000000000000000000100000000000000000000000000001000000000000000000000000000000000000000000000000000100010000010000000000000000000100000100000000000000000000
000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000110010000000001001010000000010000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000010100000000000100000000010000000000000001000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000010000000000100000000100010000000000001000000000
000000010000000000000000000111001100000000010000001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000100000000000000100000000110000010000000000000
110000000000000000000000000100001000100000010000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000
110000000000000000000000000111001100100100010001111011000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000
000000000001000000000000000001000000101000100110001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000
000000000000010000000000000000000000001101000001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000010000000000000000010000000000000010010001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000001000000000000000111100000101000110100001000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000
000000000000000000000000000100000000000110010100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


类别标签向量:

122222222212333333333131111111

using System;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace NaiveBayes
{
    /// <summary>
    /// Demo form for a three-class naive Bayes text classifier.
    /// Labels '1', '2' and '3' map to the categories shown in the UI
    /// (sports, entertainment, military). Training reads the files under
    /// "train"; classification scores the text typed into textBox1.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Number of supported categories (labels '1'..'3').</summary>
        private const int ClassCount = 3;

        /// <summary>Vocabulary words, read from vocabList.txt ('/'-separated).</summary>
        private string[] vocabArray;

        /// <summary>
        /// Per-word log-likelihoods of class 1, 2 and 3 respectively.
        /// They stay null until training (button1) has completed.
        /// </summary>
        private double[] p0Num, p1Num, p2Num;

        public Form1()
        {
            InitializeComponent();
            label2.Text = "体育1、娱乐2、军事3\r\n每个类型10个训练样本\r\n文章全部出自搜狐新闻\r\n词向量从各类文章中分词获得";
            vocabArray = ReadJoinedLines("vocabList.txt")
                .Split(new string[] { "/" }, StringSplitOptions.RemoveEmptyEntries);
        }

        /// <summary>Forces the form back to a fixed 800x600 size on every resize.</summary>
        private void Form1_Resize(object sender, EventArgs e)
        {
            this.Width = 800;
            this.Height = 600;
        }

        /// <summary>
        /// Trains the classifier: builds one word vector per file in "train",
        /// takes each label from the character after the last '_' in the file
        /// name, appends both to the files under ".\trainV", and computes the
        /// smoothed per-word log-likelihoods of each class.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Build the document matrix and the category label vector.
            FileInfo[] sampleFiles = new DirectoryInfo("train").GetFiles("*.txt");
            string[] trainMatrix = new string[sampleFiles.Length];
            StringBuilder labels = new StringBuilder(sampleFiles.Length);
            for (int i = 0; i < sampleFiles.Length; i++)
            {
                trainMatrix[i] = BuildWordVector(ReadJoinedLines(sampleFiles[i].FullName));
                string fileName = sampleFiles[i].Name;
                labels.Append(fileName.Substring(fileName.LastIndexOf('_') + 1, 1));
            }
            string trainCategory = labels.ToString();

            // The output directory may not exist on a first run.
            Directory.CreateDirectory(".\\trainV");

            // NOTE(review): both files are opened in append mode, as in the
            // original, so repeated training runs accumulate rows - confirm
            // that this is intended.
            using (StreamWriter sw = new StreamWriter(".\\trainV\\trainMatrix.txt", true))
            {
                foreach (string row in trainMatrix)
                {
                    sw.WriteLine(row);
                }
            }
            using (StreamWriter sw = new StreamWriter(".\\trainV\\trainCategory.txt", true))
            {
                sw.WriteLine(trainCategory);
            }

            // Smoothing: every word count starts at 1 and every denominator at
            // 2, so a word never seen in a class does not lead to log(0).
            double[][] wordCounts = new double[ClassCount][];
            double[] wordTotals = new double[ClassCount];
            for (int c = 0; c < ClassCount; c++)
            {
                wordCounts[c] = new double[vocabArray.Length];
                for (int j = 0; j < vocabArray.Length; j++)
                {
                    wordCounts[c][j] = 1.0;
                }
                wordTotals[c] = 2.0;
            }

            for (int i = 0; i < trainMatrix.Length; i++)
            {
                int cls = trainCategory[i] - '1';
                if (cls < 0 || cls >= ClassCount)
                {
                    // Unknown label: the sample is ignored.
                    continue;
                }

                string row = trainMatrix[i];
                for (int j = 0; j < vocabArray.Length; j++)
                {
                    if (row[j] == '1')
                    {
                        wordCounts[cls][j] += 1.0;
                        wordTotals[cls] += 1.0;
                    }
                }
            }

            // Store log-probabilities so that classification can add them.
            for (int c = 0; c < ClassCount; c++)
            {
                for (int j = 0; j < vocabArray.Length; j++)
                {
                    wordCounts[c][j] = Math.Log(wordCounts[c][j] / wordTotals[c]);
                }
            }

            // Publish the model only after training has fully succeeded.
            p0Num = wordCounts[0];
            p1Num = wordCounts[1];
            p2Num = wordCounts[2];
            label4.Text = "处理样本数据完成";
        }

        /// <summary>
        /// Classifies the text in textBox1 by summing, per class, the
        /// log-likelihoods of the vocabulary words it contains. The class
        /// prior is not added; the three scores are compared directly.
        /// A tie (including "no vocabulary word found") is reported as undecidable.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            string text = textBox1.Text;
            if (text.Trim() == "")
            {
                return;
            }

            if (p0Num == null || p1Num == null || p2Num == null)
            {
                // Training has not been run yet; there is no model to score with.
                label1.Text = "请先处理样本数据";
                return;
            }

            double p0 = 0;
            double p1 = 0;
            double p2 = 0;
            for (int j = 0; j < vocabArray.Length; j++)
            {
                if (text.Contains(vocabArray[j]))
                {
                    p0 += p0Num[j];
                    p1 += p1Num[j];
                    p2 += p2Num[j];
                }
            }

            string catelog;
            if (p0 > p1 && p0 > p2)
            {
                catelog = "体育";
            }
            else if (p1 > p0 && p1 > p2)
            {
                catelog = "娱乐";
            }
            else if (p2 > p0 && p2 > p1)
            {
                catelog = "军事";
            }
            else
            {
                catelog = "无法判断";
            }

            label3.Text = "体育:" + p0.ToString() + "\r\n娱乐:" + p1.ToString() + "\r\n军事:" + p2.ToString();
            label1.Text = "所属类型是:" + catelog;
        }

        /// <summary>
        /// Reads a text file with Encoding.Default and returns its lines
        /// concatenated without line breaks, so a vocabulary word that wraps
        /// across a line break is still found.
        /// </summary>
        private static string ReadJoinedLines(string path)
        {
            StringBuilder text = new StringBuilder();
            using (StreamReader reader = new StreamReader(path, Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    text.Append(line);
                }
            }
            return text.ToString();
        }

        /// <summary>
        /// Returns a string with one character per vocabulary word:
        /// '1' when <paramref name="text"/> contains the word, otherwise '0'.
        /// </summary>
        private string BuildWordVector(string text)
        {
            StringBuilder vector = new StringBuilder(vocabArray.Length);
            foreach (string word in vocabArray)
            {
                vector.Append(text.Contains(word) ? '1' : '0');
            }
            return vector.ToString();
        }
    }
}


  相关解决方案