由隐马尔科夫意淫无字典中文分词 C_C#

由隐马尔科夫意淫无字典中文分词 C#

using System;using System;using System.Windows.Forms;using System.IO;using System.Text.RegularExpressions;using System.Collections;using System.Collections.Generic;using System.ComponentModel;namespace HMM{    public partial class Form1 : Form    {        string[] arrayData;        DirectoryInfo di;        FileInfo[] fis;        Hashtable htDict = new Hashtable();        double singleCutRate;        public Form1()        {            InitializeComponent();            label1.Text = "先预处理！";            progressBar1.Visible = false;            di = new DirectoryInfo("data");            fis = di.GetFiles("*.txt");            arrayData = new string[fis.Length];        }        private void Form1_Resize(object sender, EventArgs e)        {            this.Width = 800;            this.Height = 600;        }        private void button1_Click(object sender, EventArgs e)        {            if (!new FileInfo("dict.txt").Exists)            {                int count = 0;                progressBar1.Visible = true;                BackgroundWorker worker = new BackgroundWorker();                worker.WorkerReportsProgress = true;  //报告进度                  worker.DoWork += (s, o) =>                {                    int progressCount = 1;                    foreach (FileInfo i in fis)                    {                        StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default);                        arrayData[progressCount - 1] = sr.ReadToEnd();                        sr.Close();                        worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null);                        progressCount++;                    }                    for (int i = 0; i < arrayData.Length; i++)                    {                        arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");                        for (int j = 0; j < arrayData[i].Length; j++)                        {                            string strWord = arrayData[i].Substring(j, 1);                            if (IsChinese(strWord))                            {                                if (htDict.ContainsKey(strWord))                                {                                    htDict[strWord] = ((int)htDict[strWord]) + 1;                                }                                else                                {                                    htDict.Add(strWord, 1);                                }                            }                        }                        worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);                    }                    StreamWriter sw = new StreamWriter("dict.txt", false, System.Text.Encoding.Default);                    foreach (DictionaryEntry i in htDict)                    {                        sw.WriteLine(i.Key + "|" + i.Value);                        count++;                        sw.Flush();                        worker.ReportProgress((int)((double)count / (double)htDict.Count * 33) + 67, null);                    }                    sw.Close();                };                worker.RunWorkerCompleted += (s, o) =>                {                    this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成！|" + count; }));                };                worker.ProgressChanged += (s, o) =>                {                    progressBar1.Style = ProgressBarStyle.Continuous;                    progressBar1.Value = o.ProgressPercentage;                };                worker.RunWorkerAsync();            }            else            {                int count = 0;                progressBar1.Visible = true;                BackgroundWorker worker = new BackgroundWorker();                worker.WorkerReportsProgress = true;  //报告进度                  worker.DoWork += (s, o) =>                {                    int progressCount = 1;                    foreach (FileInfo i in fis)                    {                        StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default);                        arrayData[progressCount - 1] = sr.ReadToEnd();                        sr.Close();                        worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null);                        progressCount++;                    }                    for (int i = 0; i < arrayData.Length; i++)                    {                        arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");                        worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);                    }                    StreamReader reader = new StreamReader("dict.txt", System.Text.Encoding.Default);                    string line = "";                    while ((line = reader.ReadLine()) != null)                    {                        htDict[line.Substring(0, 1)] = line.Substring(2);                        count++;                    }                    reader.Close();                    worker.ReportProgress(100, null);                };                worker.RunWorkerCompleted += (s, o) =>                {                    this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成！|" + count; }));                };                worker.ProgressChanged += (s, o) =>                {                    progressBar1.Style = ProgressBarStyle.Continuous;                    progressBar1.Value = o.ProgressPercentage;                };                worker.RunWorkerAsync();            }        }        private void button2_Click(object sender, EventArgs e)        {            if (label1.Text != "先预处理！" && textBox1.Text.Trim() != "")            {                textBox2.Text = "";                if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate))                {                    singleCutRate = 0.01;                }                List<string> list = new List<string>();                string strSplitWords = Regex.Replace(textBox1.Text.Trim(), @"[^\u4e00-\u9fa5]", "");                int startPos = 0;                int m = 1;                string strWord1 = "";                string strWord2 = "";                progressBar1.Visible = true;                BackgroundWorker worker = new BackgroundWorker();                worker.WorkerReportsProgress = true;  //报告进度                worker.DoWork += (s, o) =>                {                    while (strSplitWords.Length >= 2)                    {                        if (strWord1 == "")                        {                            strWord1 = strSplitWords.Substring(startPos, m);                        }                        strWord2 = strSplitWords.Substring(startPos, ++m);                        double x1 = (double)ReturnCount(strWord1, arrayData);                        double y1 = (double)ReturnTotalCount(strWord1);                        if (y1 == 0)                            y1++;                        double a = x1 / y1;                        double x2 = (double)ReturnCount(strWord2, arrayData);                        double y2 = (double)ReturnTotalCount(strWord2);                        if (y2 == 0)                            y2++;                        double b = x2 / y2;                        if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))                        {                            list.Add(strWord1);                            startPos += strWord1.Length;                            worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null);                            m = 1;                            strWord1 = "";                            strWord2 = "";                            if ((strSplitWords.Length - startPos) == 1)                            {                                list.Add(strSplitWords.Substring(startPos, 1));                                break;                            }                            else if ((strSplitWords.Length - startPos) < 1)                            {                                break;                            }                        }                        else                        {                            strWord1 = strWord2;                            strWord2 = "";                            if ((strSplitWords.Length - startPos - m) < 1)                            {                                list.Add(strWord1);                                startPos += strWord1.Length;                                worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null);                                break;                            }                        }                    }                    worker.ReportProgress(100, null);                };                worker.RunWorkerCompleted += (s, o) =>         {             this.Invoke(new MethodInvoker(() =>             {                 progressBar1.Visible = false;                 progressBar1.Value = 0;                 foreach (string i in list)                 {                     textBox2.Text += i + "|";                 }                 label2.Text = "分词完成！";             }));         };                worker.ProgressChanged += (s, o) =>                {                    progressBar1.Style = ProgressBarStyle.Continuous;                    progressBar1.Value = o.ProgressPercentage;                };                worker.RunWorkerAsync();            }        }        public bool IsChinese(string str)        {            return Regex.IsMatch(str, @"^[\u4e00-\u9fa5]+$");        }          public int ReturnCount(string s, string[] d)        {            int count = 0;            for (int i = 0; i < d.Length; i++)            {                count += Regex.Matches(d[i], s).Count;            }            return count;        }        public int ReturnTotalCount(string s)        {            int total = 0;            for (int i = 0; i < s.Length; i++)            {                if (htDict.ContainsKey(s.Substring(i, 1)))                {                    total += Convert.ToInt32(htDict[s.Substring(i, 1)]);                }            }            return total;        }    }}

最近在看机器学习方面的书，看到隐马尔科夫，意淫了一下无字典中文分词的可能性，我设想了一种分词方式，并无聊了一个程序，因为执行效率相当差，所以添加了进度条，否则真的等到受不了，仅供参考

1、下载了3200本各类电子书，600多M

2、预先扫描每个字出现的概率P（w）

3、清除待分词内容c中非中文字符

4、从左向右扫描ci，(i为字数,每一位为w1、w2、w3.....wi),

开始时：计算第一个字c1的a=P(c1)/P(w1)和前两个字c2的b=P(c2)/(P(w1)+P(w2))

comp:if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))

则分割c1,指针向后移动i

否则继续比较前两个字 a=P(c2)/(P(w1)+P(w2))

和前三个字 b=P(c3)/(P(w1)+P(w2)+P(w3)) 的大小循环到comp

注意检测c的尾部并及时跳出循环，singleCutRate用于估算单字的切割概率，比如

“中” 和 “中国” ，当 P(中国)/(P(中)+P(国)) >=0.05 认为“中国”是固有词汇，否则直接分割“中”，这个切割概率需要调教一个合理的数值，似乎最好的区间在0.005上下。

yy完毕!

1楼GJYSK昨天 13:07: 呵呵，有点意思，持续关注博主！