标签:
using System; using System.Windows.Forms; using System.IO; using System.Text.RegularExpressions; using System.Collections; using System.Collections.Generic; using System.ComponentModel; namespace HMM { public partial class Form1 : Form { string[] arrayData; DirectoryInfo di; FileInfo[] fis; Hashtable htDict = new Hashtable(); double singleCutRate; public Form1() { InitializeComponent(); label1.Text = "先预处理!"; progressBar1.Visible = false; di = new DirectoryInfo("data"); fis = di.GetFiles("*.txt"); arrayData = new string[fis.Length]; if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate)) { singleCutRate = 0.05; } } private void Form1_Resize(object sender, EventArgs e) { this.Width = 800; this.Height = 600; } private void button1_Click(object sender, EventArgs e) { if (!new FileInfo("dict.txt").Exists) { int count = 0; progressBar1.Visible = true; BackgroundWorker worker = new BackgroundWorker(); worker.WorkerReportsProgress = true; //报告进度 worker.DoWork += (s, o) => { int progressCount = 1; foreach (FileInfo i in fis) { StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default); arrayData[progressCount - 1] = sr.ReadToEnd(); sr.Close(); worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null); progressCount++; } for (int i = 0; i < arrayData.Length; i++) { arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", ""); for (int j = 0; j < arrayData[i].Length; j++) { string strWord = arrayData[i].Substring(j, 1); if (IsChinese(strWord)) { if (htDict.ContainsKey(strWord)) { htDict[strWord] = ((int)htDict[strWord]) + 1; } else { htDict.Add(strWord, 1); } } } worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null); } StreamWriter sw = new StreamWriter("dict.txt", false, System.Text.Encoding.Default); foreach (DictionaryEntry i in htDict) { sw.WriteLine(i.Key + "|" + i.Value); count++; sw.Flush(); worker.ReportProgress((int)((double)count / (double)htDict.Count * 33) + 67, null); } sw.Close(); }; worker.RunWorkerCompleted += (s, o) => { this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成!|" + count; })); }; worker.ProgressChanged += (s, o) => { progressBar1.Style = ProgressBarStyle.Continuous; progressBar1.Value = o.ProgressPercentage; }; worker.RunWorkerAsync(); } else { int count = 0; progressBar1.Visible = true; BackgroundWorker worker = new BackgroundWorker(); worker.WorkerReportsProgress = true; //报告进度 worker.DoWork += (s, o) => { int progressCount = 1; foreach (FileInfo i in fis) { StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default); arrayData[progressCount - 1] = sr.ReadToEnd(); sr.Close(); worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null); progressCount++; } for (int i = 0; i < arrayData.Length; i++) { arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", ""); worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null); } StreamReader reader = new StreamReader("dict.txt", System.Text.Encoding.Default); string line = ""; while ((line = reader.ReadLine()) != null) { htDict[line.Substring(0, 1)] = line.Substring(2); count++; } reader.Close(); worker.ReportProgress(100, null); }; worker.RunWorkerCompleted += (s, o) => { this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成!|" + count; })); }; worker.ProgressChanged += (s, o) => { progressBar1.Style = ProgressBarStyle.Continuous; progressBar1.Value = o.ProgressPercentage; }; worker.RunWorkerAsync(); } } private void button2_Click(object sender, EventArgs e) { if (label1.Text != "先预处理!" && textBox1.Text.Trim() != "") { List<string> list = new List<string>(); string strSplitWords = Regex.Replace(textBox1.Text.Trim(), @"[^\u4e00-\u9fa5]", ""); int startPos = 0; int m = 1; string strWord1 = ""; string strWord2 = ""; progressBar1.Visible = true; BackgroundWorker worker = new BackgroundWorker(); worker.WorkerReportsProgress = true; //报告进度 worker.DoWork += (s, o) => { while (strSplitWords.Length >= 2) { if (strWord1 == "") { strWord1 = strSplitWords.Substring(startPos, m); } strWord2 = strSplitWords.Substring(startPos, ++m); double x1 = (double)ReturnCount(strWord1, arrayData); double y1 = (double)ReturnTotalCount(strWord1); if (y1 == 0) y1++; double a = x1 / y1; double x2 = (double)ReturnCount(strWord2, arrayData); double y2 = (double)ReturnTotalCount(strWord2); if (y2 == 0) y2++; double b = x2 / y2; if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0)) { list.Add(strWord1); startPos += strWord1.Length; worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null); m = 1; strWord1 = ""; strWord2 = ""; if ((strSplitWords.Length - startPos) == 1) { list.Add(strSplitWords.Substring(startPos, 1)); break; } else if ((strSplitWords.Length - startPos) < 1) { break; } } else { strWord1 = strWord2; strWord2 = ""; if ((strSplitWords.Length - startPos - m) < 1) { list.Add(strWord1); startPos += strWord1.Length; worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null); break; } } } worker.ReportProgress(100, null); }; worker.RunWorkerCompleted += (s, o) => { this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; foreach (string i in list) { textBox2.Text += i + "|"; } label2.Text = "分词完成!"; })); }; worker.ProgressChanged += (s, o) => { progressBar1.Style = ProgressBarStyle.Continuous; progressBar1.Value = o.ProgressPercentage; }; worker.RunWorkerAsync(); } } public bool IsChinese(string str) { return Regex.IsMatch(str, @"^[\u4e00-\u9fa5]+$"); } public int ReturnCount(string s, string[] d) { int count = 0; for (int i = 0; i < d.Length; i++) { int pos = 0; while (true) { pos = d[i].IndexOf(s, pos); if (pos != -1) { pos++; count++; } else { break; } } } return count; } public int ReturnTotalCount(string s) { int total = 0; for (int i = 0; i < s.Length; i++) { if (htDict.ContainsKey(s.Substring(i, 1))) { total += Convert.ToInt32(htDict[s.Substring(i, 1)]); } } return total; } } }
最近在看机器学习方面的书,看到隐马尔科夫,意淫了一下无字典中文分词的可能性,我设想了一种分词方式,并无聊了一个程序,因为执行效率相当差,所以添加了进度条,否则真的等到受不了,仅供参考
1、下载了3200本各类电子书,600多M
2、预先扫描每个字出现的概率P(w)
3、清除待分词内容c中非中文字符
4、从左向右扫描ci,(i为字数,每一位为w1、w2、w3.....wi),
开始时:计算第一个字c1的a=P(c1)/P(w1)和前两个字c2的b=P(c2)/(P(w1)+P(w2))
comp:if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))
则分割c1,指针前后移动i
否则继续比较 前两个字 a=P(c2)/(P(w1)+P(w2))
和前三个字 b=P(c3)/(P(w1)+P(w2)+P(w3)) 的大小 循环到comp
注意检测c的尾部并及时跳出循环,singleCutRate用于估算单字的切割概率,比如
“中” 和 “中国” ,当 P(中国)/(P(中)+P(国)) >=0.05 认为“中国”是固有词汇,否则直接分割“中”,这个切割概率需要调教一个合理的数值。
yy完毕!
标签:
原文地址:http://blog.csdn.net/joycesunny/article/details/44856007