当前位置: 代码迷 >> C# >> 由隐马尔科夫意淫无字典中文分词 C
  详细解决方案

由隐马尔科夫意淫无字典中文分词 C

热度:69   发布时间:2016-05-05 04:15:15.0
由隐马尔科夫意淫无字典中文分词 C#
using System;using System;using System.Windows.Forms;using System.IO;using System.Text.RegularExpressions;using System.Collections;using System.Collections.Generic;using System.ComponentModel;namespace HMM{    public partial class Form1 : Form    {        string[] arrayData;        DirectoryInfo di;        FileInfo[] fis;        Hashtable htDict = new Hashtable();        double singleCutRate;        public Form1()        {            InitializeComponent();            label1.Text = "先预处理!";            progressBar1.Visible = false;            di = new DirectoryInfo("data");            fis = di.GetFiles("*.txt");            arrayData = new string[fis.Length];        }        private void Form1_Resize(object sender, EventArgs e)        {            this.Width = 800;            this.Height = 600;        }        private void button1_Click(object sender, EventArgs e)        {            if (!new FileInfo("dict.txt").Exists)            {                int count = 0;                progressBar1.Visible = true;                BackgroundWorker worker = new BackgroundWorker();                worker.WorkerReportsProgress = true;  //报告进度                  worker.DoWork += (s, o) =>                {                    int progressCount = 1;                    foreach (FileInfo i in fis)                    {                        StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default);                        arrayData[progressCount - 1] = sr.ReadToEnd();                        sr.Close();                        worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null);                        progressCount++;                    }                    for (int i = 0; i < arrayData.Length; i++)                    {                        arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");                        for (int j = 0; j < arrayData[i].Length; j++)                        {                            string strWord = arrayData[i].Substring(j, 1);                            if (IsChinese(strWord))                            {                                if (htDict.ContainsKey(strWord))                                {                                    htDict[strWord] = ((int)htDict[strWord]) + 1;                                }                                else                                {                                    htDict.Add(strWord, 1);                                }                            }                        }                        worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);                    }                    StreamWriter sw = new StreamWriter("dict.txt", false, System.Text.Encoding.Default);                    foreach (DictionaryEntry i in htDict)                    {                        sw.WriteLine(i.Key + "|" + i.Value);                        count++;                        sw.Flush();                        worker.ReportProgress((int)((double)count / (double)htDict.Count * 33) + 67, null);                    }                    sw.Close();                };                worker.RunWorkerCompleted += (s, o) =>                {                    this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成!|" + count; }));                };                worker.ProgressChanged += (s, o) =>                {                    progressBar1.Style = ProgressBarStyle.Continuous;                    progressBar1.Value = o.ProgressPercentage;                };                worker.RunWorkerAsync();            }            else            {                int count = 0;                progressBar1.Visible = true;                BackgroundWorker worker = new BackgroundWorker();                worker.WorkerReportsProgress = true;  //报告进度                  worker.DoWork += (s, o) =>                {                    int progressCount = 1;                    foreach (FileInfo i in fis)                    {                        StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default);                        arrayData[progressCount - 1] = sr.ReadToEnd();                        sr.Close();                        worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null);                        progressCount++;                    }                    for (int i = 0; i < arrayData.Length; i++)                    {                        arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");                        worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);                    }                    StreamReader reader = new StreamReader("dict.txt", System.Text.Encoding.Default);                    string line = "";                    while ((line = reader.ReadLine()) != null)                    {                        htDict[line.Substring(0, 1)] = line.Substring(2);                        count++;                    }                    reader.Close();                    worker.ReportProgress(100, null);                };                worker.RunWorkerCompleted += (s, o) =>                {                    this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成!|" + count; }));                };                worker.ProgressChanged += (s, o) =>                {                    progressBar1.Style = ProgressBarStyle.Continuous;                    progressBar1.Value = o.ProgressPercentage;                };                worker.RunWorkerAsync();            }        }        private void button2_Click(object sender, EventArgs e)        {            if (label1.Text != "先预处理!" && textBox1.Text.Trim() != "")            {                textBox2.Text = "";                if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate))                {                    singleCutRate = 0.01;                }                List<string> list = new List<string>();                string strSplitWords = Regex.Replace(textBox1.Text.Trim(), @"[^\u4e00-\u9fa5]", "");                int startPos = 0;                int m = 1;                string strWord1 = "";                string strWord2 = "";                progressBar1.Visible = true;                BackgroundWorker worker = new BackgroundWorker();                worker.WorkerReportsProgress = true;  //报告进度                worker.DoWork += (s, o) =>                {                    while (strSplitWords.Length >= 2)                    {                        if (strWord1 == "")                        {                            strWord1 = strSplitWords.Substring(startPos, m);                        }                        strWord2 = strSplitWords.Substring(startPos, ++m);                        double x1 = (double)ReturnCount(strWord1, arrayData);                        double y1 = (double)ReturnTotalCount(strWord1);                        if (y1 == 0)                            y1++;                        double a = x1 / y1;                        double x2 = (double)ReturnCount(strWord2, arrayData);                        double y2 = (double)ReturnTotalCount(strWord2);                        if (y2 == 0)                            y2++;                        double b = x2 / y2;                        if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))                        {                            list.Add(strWord1);                            startPos += strWord1.Length;                            worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null);                            m = 1;                            strWord1 = "";                            strWord2 = "";                            if ((strSplitWords.Length - startPos) == 1)                            {                                list.Add(strSplitWords.Substring(startPos, 1));                                break;                            }                            else if ((strSplitWords.Length - startPos) < 1)                            {                                break;                            }                        }                        else                        {                            strWord1 = strWord2;                            strWord2 = "";                            if ((strSplitWords.Length - startPos - m) < 1)                            {                                list.Add(strWord1);                                startPos += strWord1.Length;                                worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null);                                break;                            }                        }                    }                    worker.ReportProgress(100, null);                };                worker.RunWorkerCompleted += (s, o) =>         {             this.Invoke(new MethodInvoker(() =>             {                 progressBar1.Visible = false;                 progressBar1.Value = 0;                 foreach (string i in list)                 {                     textBox2.Text += i + "|";                 }                 label2.Text = "分词完成!";             }));         };                worker.ProgressChanged += (s, o) =>                {                    progressBar1.Style = ProgressBarStyle.Continuous;                    progressBar1.Value = o.ProgressPercentage;                };                worker.RunWorkerAsync();            }        }        public bool IsChinese(string str)        {            return Regex.IsMatch(str, @"^[\u4e00-\u9fa5]+$");        }          public int ReturnCount(string s, string[] d)        {            int count = 0;            for (int i = 0; i < d.Length; i++)            {                count += Regex.Matches(d[i], s).Count;            }            return count;        }        public int ReturnTotalCount(string s)        {            int total = 0;            for (int i = 0; i < s.Length; i++)            {                if (htDict.ContainsKey(s.Substring(i, 1)))                {                    total += Convert.ToInt32(htDict[s.Substring(i, 1)]);                }            }            return total;        }    }}




最近在看机器学习方面的书,看到隐马尔科夫,意淫了一下无字典中文分词的可能性,我设想了一种分词方式,并无聊了一个程序,因为执行效率相当差,所以添加了进度条,否则真的等到受不了,仅供参考

1、下载了3200本各类电子书,600多M

2、预先扫描每个字出现的概率P(w)

3、待分词内容c中非中文字符

4、从左向右扫描ci,(i为字数,每一位为w1、w2、w3.....wi),

       开始时:计算第一个字c1的a=P(c1)/P(w1)和前两个字c2的b=P(c2)/(P(w1)+P(w2))

   comp:if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))

   则分割c1,指针向后移动i

   否则继续比较 前两个字 a=P(c2)/(P(w1)+P(w2)) 

                   和前三个字  b=P(c3)/(P(w1)+P(w2)+P(w3)) 的大小  循环到comp

    注意检测c的尾部并及时跳出循环,singleCutRate用于估算单字的切割概率,比如

   “中” 和 “中国” ,当 P(中国)/(P()+P()) >=0.05 认为“中国”是固有词汇,否则直接分割“中”,这个切割概率需要调教一个合理的数值,似乎最好的区间在0.005上下。

    yy完毕!

       


1楼GJYSK昨天 13:07
呵呵,有点意思,持续关注博主!
  相关解决方案