码迷,mamicode.com
首页 > Windows程序 > 详细

由隐马尔科夫意淫无字典中文分词 C#

时间:2015-04-07 11:58:15      阅读:319      评论:0      收藏:0      [点我收藏+]

标签:

using System;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;

namespace HMM
{
    public partial class Form1 : Form
    {
        string[] arrayData;
        DirectoryInfo di;
        FileInfo[] fis;
        Hashtable htDict = new Hashtable();
        double singleCutRate;

        /// <summary>
        /// Initializes the form: resets the status label and progress bar, enumerates the
        /// "*.txt" corpus files under the relative "data" directory, sizes the corpus buffer
        /// to match, and reads the initial single-character cut threshold from textBox3.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            label1.Text = "先预处理!";
            progressBar1.Visible = false;
            di = new DirectoryInfo("data");
            // GetFiles throws DirectoryNotFoundException when the directory does not exist,
            // which would make the constructor throw; treat a missing directory as an empty corpus.
            fis = di.Exists ? di.GetFiles("*.txt") : new FileInfo[0];
            arrayData = new string[fis.Length];
            // Fall back to 0.05 when the text box does not contain a parsable number.
            if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate))
            {
                singleCutRate = 0.05;
            }
        }

        /// <summary>
        /// Resize handler that puts the form back to 800 x 600, width first and then height.
        /// Presumably wired to the form's Resize event in the designer file (not visible here).
        /// </summary>
        private void Form1_Resize(object sender, EventArgs e)
        {
            const int fixedWidth = 800;
            const int fixedHeight = 600;

            Width = fixedWidth;
            Height = fixedHeight;
        }

        /// <summary>
        /// Runs the preprocessing step on a background worker. The corpus files are read into
        /// arrayData and stripped of everything outside \u4e00-\u9fa5. If "dict.txt" does not
        /// exist, per-character counts are computed into htDict and written to "dict.txt";
        /// otherwise the counts are loaded from "dict.txt". On success label1 shows the number
        /// of dictionary entries; on failure the error message is shown and label1 is reset so
        /// that segmentation stays blocked.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Decide once, before the worker starts, whether the dictionary is built or loaded.
            bool buildDictionary = !File.Exists("dict.txt");
            progressBar1.Visible = true;
            BackgroundWorker worker = new BackgroundWorker();
            worker.WorkerReportsProgress = true;  // report progress
            worker.DoWork += (s, o) =>
            {
                // Start from an empty table so a repeated run neither double-counts characters
                // nor mixes entries left over from an earlier run.
                htDict.Clear();
                ReadCorpusFiles(worker);
                NormalizeCorpus(worker, buildDictionary);
                if (buildDictionary)
                {
                    o.Result = WriteDictionary(worker);
                }
                else
                {
                    o.Result = ReadDictionary();
                    worker.ReportProgress(100, null);
                }
            };
            worker.RunWorkerCompleted += (s, o) =>
            {
                progressBar1.Visible = false;
                progressBar1.Value = 0;
                if (o.Error != null)
                {
                    // An exception thrown inside DoWork is delivered here through Error. Keep the
                    // "not preprocessed" marker so button2_Click does not run on partial data.
                    label1.Text = "先预处理!";
                    MessageBox.Show(this, o.Error.Message);
                    return;
                }
                label1.Text = "预处理完成!|" + (int)o.Result;
            };
            worker.ProgressChanged += (s, o) =>
            {
                progressBar1.Style = ProgressBarStyle.Continuous;
                progressBar1.Value = o.ProgressPercentage;
            };
            worker.RunWorkerAsync();
        }

        /// <summary>Reads every corpus file into arrayData, reporting progress in the 0-33 range.</summary>
        private void ReadCorpusFiles(BackgroundWorker worker)
        {
            for (int i = 0; i < fis.Length; i++)
            {
                // using guarantees the file handle is released even if ReadToEnd throws.
                using (StreamReader sr = new StreamReader(fis[i].FullName, System.Text.Encoding.Default))
                {
                    arrayData[i] = sr.ReadToEnd();
                }
                worker.ReportProgress((int)((double)(i + 1) / (double)fis.Length * 33), null);
            }
        }

        /// <summary>
        /// Strips every character outside \u4e00-\u9fa5 from each corpus text and, when
        /// <paramref name="countCharacters"/> is true, accumulates per-character counts in htDict.
        /// Reports progress in the 33-66 range.
        /// </summary>
        private void NormalizeCorpus(BackgroundWorker worker, bool countCharacters)
        {
            for (int i = 0; i < arrayData.Length; i++)
            {
                arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");
                if (countCharacters)
                {
                    // After the replace above every remaining character is inside the range,
                    // so no further per-character filtering is needed.
                    foreach (char c in arrayData[i])
                    {
                        string key = c.ToString();
                        object seen = htDict[key];
                        htDict[key] = seen == null ? 1 : Convert.ToInt32(seen) + 1;
                    }
                }
                worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);
            }
        }

        /// <summary>
        /// Writes htDict to "dict.txt" as one "key|count" line per entry, reporting progress in
        /// the 67-100 range. Returns the number of entries written.
        /// </summary>
        private int WriteDictionary(BackgroundWorker worker)
        {
            int written = 0;
            // Disposing the writer flushes it, so no per-line Flush is required.
            using (StreamWriter sw = new StreamWriter("dict.txt", false, System.Text.Encoding.Default))
            {
                foreach (DictionaryEntry entry in htDict)
                {
                    sw.WriteLine(entry.Key + "|" + entry.Value);
                    written++;
                    worker.ReportProgress((int)((double)written / (double)htDict.Count * 33) + 67, null);
                }
            }
            return written;
        }

        /// <summary>
        /// Loads "key|count" lines from "dict.txt" into htDict, storing counts as int (the same
        /// type the counting path stores). Lines that do not match that shape are skipped.
        /// Returns the number of entries loaded.
        /// </summary>
        private int ReadDictionary()
        {
            int loaded = 0;
            using (StreamReader reader = new StreamReader("dict.txt", System.Text.Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    int value;
                    // Blank or truncated lines would make Substring(2) throw; ignore them.
                    if (line.Length < 3 || line[1] != '|' || !int.TryParse(line.Substring(2), out value))
                    {
                        continue;
                    }
                    htDict[line.Substring(0, 1)] = value;
                    loaded++;
                }
            }
            return loaded;
        }

        /// <summary>
        /// Segments the Chinese characters of textBox1 on a background worker and appends the
        /// resulting words, each followed by "|", to textBox2. Does nothing while label1 still
        /// shows the "not preprocessed" marker or when textBox1 is blank.
        /// </summary>
        /// <remarks>
        /// Scanning left to right, a candidate word is extended one character at a time. With
        /// a = ratio of the current candidate and b = ratio of the candidate plus one character
        /// (see <see cref="CohesionRatio"/>), the candidate is cut off as a word when
        /// (a &lt; 1 and a &gt; b) or (a == 1 and b &lt; threshold) or (a == 0 and b == 0).
        /// </remarks>
        private void button2_Click(object sender, EventArgs e)
        {
            if (label1.Text == "先预处理!" || textBox1.Text.Trim() == "")
            {
                return;
            }

            // Re-read the threshold on every run; reading it only in the constructor meant that
            // edits made to textBox3 afterwards never took effect.
            if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate))
            {
                singleCutRate = 0.05;
            }
            double cutRate = singleCutRate;

            List<string> list = new List<string>();
            string strSplitWords = Regex.Replace(textBox1.Text.Trim(), @"[^\u4e00-\u9fa5]", "");
            progressBar1.Visible = true;
            BackgroundWorker worker = new BackgroundWorker();
            worker.WorkerReportsProgress = true;  // report progress
            worker.DoWork += (s, o) =>
            {
                int startPos = 0;
                // Also covers a one-character input, which is emitted as a single word.
                while (startPos < strSplitWords.Length)
                {
                    int length = 1;
                    // A lone trailing character needs no scoring; it is emitted as-is.
                    if (startPos + 1 < strSplitWords.Length)
                    {
                        double a = CohesionRatio(strSplitWords.Substring(startPos, 1));
                        while (startPos + length < strSplitWords.Length)
                        {
                            double b = CohesionRatio(strSplitWords.Substring(startPos, length + 1));
                            if ((a < 1 && a > b) || (a == 1 && b < cutRate) || (a == 0 && b == 0))
                            {
                                break;
                            }
                            // The longer candidate becomes the current one; reuse its ratio
                            // instead of rescanning the whole corpus for the same string.
                            length++;
                            a = b;
                        }
                    }
                    list.Add(strSplitWords.Substring(startPos, length));
                    startPos += length;
                    worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null);
                }
                worker.ReportProgress(100, null);
            };
            worker.RunWorkerCompleted += (s, o) =>
            {
                progressBar1.Visible = false;
                progressBar1.Value = 0;
                if (o.Error != null)
                {
                    // Surface worker failures instead of reporting a successful segmentation.
                    MessageBox.Show(this, o.Error.Message);
                    return;
                }
                // Build the output once rather than reassigning textBox2.Text per word.
                System.Text.StringBuilder joined = new System.Text.StringBuilder();
                foreach (string word in list)
                {
                    joined.Append(word).Append("|");
                }
                textBox2.Text += joined.ToString();
                label2.Text = "分词完成!";
            };
            worker.ProgressChanged += (s, o) =>
            {
                progressBar1.Style = ProgressBarStyle.Continuous;
                progressBar1.Value = o.ProgressPercentage;
            };
            worker.RunWorkerAsync();
        }

        /// <summary>
        /// Returns ReturnCount(word, arrayData) divided by ReturnTotalCount(word); a zero
        /// denominator is replaced by 1 so the division is always defined.
        /// </summary>
        private double CohesionRatio(string word)
        {
            double occurrences = (double)ReturnCount(word, arrayData);
            double total = (double)ReturnTotalCount(word);
            if (total == 0)
            {
                total = 1;
            }
            return occurrences / total;
        }

        /// <summary>
        /// Returns true when <paramref name="str"/> is non-empty and every character lies in
        /// the range \u4e00-\u9fa5.
        /// </summary>
        /// <param name="str">Text to test.</param>
        /// <returns>True if the whole string consists of characters in that range; otherwise false.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
        public bool IsChinese(string str)
        {
            if (str == null)
            {
                throw new ArgumentNullException("str");
            }
            if (str.Length == 0)
            {
                return false;
            }
            // A plain range check is exact. The previous pattern ended in "$", which also
            // matches just before a final '\n', so "中\n" was wrongly accepted.
            foreach (char c in str)
            {
                if (c < '\u4e00' || c > '\u9fa5')
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Counts how many times <paramref name="s"/> occurs across all strings in
        /// <paramref name="d"/>. Overlapping occurrences are counted (the search resumes one
        /// character after each match).
        /// </summary>
        /// <param name="s">Substring to look for; null or empty yields 0.</param>
        /// <param name="d">Texts to search; a null array or null entries contribute 0.</param>
        /// <returns>Total number of occurrences.</returns>
        public int ReturnCount(string s, string[] d)
        {
            // An empty needle matches at every index and previously ran the search start
            // past the end of the text, raising ArgumentOutOfRangeException.
            if (string.IsNullOrEmpty(s) || d == null)
            {
                return 0;
            }

            int count = 0;
            foreach (string text in d)
            {
                if (text == null)
                {
                    continue;
                }
                // Ordinal comparison matches exact code units; it avoids the culture-sensitive
                // default of IndexOf(string, int), which is slower and can vary by locale.
                int pos = text.IndexOf(s, StringComparison.Ordinal);
                while (pos != -1)
                {
                    count++;
                    pos = text.IndexOf(s, pos + 1, StringComparison.Ordinal);
                }
            }
            return count;
        }

        /// <summary>
        /// Adds up the htDict values of every character in <paramref name="s"/>. Characters
        /// that have no entry in htDict contribute nothing.
        /// </summary>
        /// <param name="s">Text whose characters are looked up one by one.</param>
        /// <returns>The sum of the stored values, converted with Convert.ToInt32.</returns>
        public int ReturnTotalCount(string s)
        {
            int sum = 0;
            foreach (char ch in s)
            {
                string key = ch.ToString();
                if (!htDict.ContainsKey(key))
                {
                    continue;
                }
                sum += Convert.ToInt32(htDict[key]);
            }
            return sum;
        }
    }
}

技术分享

最近在看机器学习方面的书,看到隐马尔科夫,意淫了一下无字典中文分词的可能性,我设想了一种分词方式,并无聊了一个程序,因为执行效率相当差,所以添加了进度条,否则真的等到受不了,仅供参考

1、下载了3200本各类电子书,600多M

2、预先扫描每个字出现的概率P(w)

3、去除待分词内容c中的非中文字符

4、从左向右扫描ci,(i为字数,每一位为w1、w2、w3.....wi),

       开始时:计算第一个字c1的a=P(c1)/P(w1)和前两个字c2的b=P(c2)/(P(w1)+P(w2))

   comp:if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))

   则分割c1,指针向后移动已分割出的字数

   否则继续比较 前两个字 a=P(c2)/(P(w1)+P(w2)) 

                   和前三个字  b=P(c3)/(P(w1)+P(w2)+P(w3)) 的大小  循环到comp

    注意检测c的尾部并及时跳出循环,singleCutRate用于估算单字的切割概率,比如

   “中” 和 “中国” ,当 P(中国)/(P(中)+P(国)) >= 0.05 时,认为“中国”是固有词汇,否则直接分割“中”,这个切割概率需要调教出一个合理的数值。

    yy完毕!

       


由隐马尔科夫意淫无字典中文分词 C#

标签:

原文地址:http://blog.csdn.net/joycesunny/article/details/44856007

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!