标签:
注意代码里面用变量控制分类的写法 Uni Bi Tri
而且开始想的是两层循环,大循环找词,小循环再每行数数,这样会太慢了
所以借鉴源码的思路,是通过一个大循环,在dictionary中记录每个词出现多少次,同时用find1标记这一句标注的是1还是0
wordpos1相当于记录的是每个词的n11,而对于每个词n.1都是一样的,就是出现标记1的话的总次数
wordneg1记录的是每个词的n10(与代码中的变量名一致),即出现目标词但是没有标1的次数,同样对每个词n.0都是一样的,即标记为0(没有标1)的句子的总次数
这个写的思路也告诉我要得到数据但是全都记录可能会比较复杂,记录间接数据也可以帮助得到最终结果
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;

namespace ConsoleApplication2
{
    /// <summary>
    /// Counts character n-grams in a tab-separated, labelled text file and scores
    /// each n-gram with chi-square, tf, idf and tf*idf against the "1" label.
    /// </summary>
    /// <remarks>
    /// All counters are static and accumulate, so the program is meant to process
    /// one input with one <see cref="Gram"/> setting per run.
    /// </remarks>
    class Program
    {
        // Total number of occurrences of each n-gram (over all counted lines).
        static Dictionary<string, int> word = new Dictionary<string, int>();
        // Occurrences of each n-gram on lines labelled "1" (used as n11).
        static Dictionary<string, int> wordpos1 = new Dictionary<string, int>();
        // Occurrences of each n-gram on lines not labelled "1" (used as n10).
        static Dictionary<string, int> wordneg1 = new Dictionary<string, int>();
        // Chi-square score per n-gram.
        static Dictionary<string, double> word_x = new Dictionary<string, double>();
        // tf, idf and tf*idf per n-gram.
        static Dictionary<string, double> wordtf = new Dictionary<string, double>();
        static Dictionary<string, double> wordidf = new Dictionary<string, double>();
        static Dictionary<string, double> wordtfidf = new Dictionary<string, double>();
        // Number of counted lines labelled "1" / not labelled "1".
        static int posfirst = 0;
        static int negfirst = 0;

        /// <summary>
        /// Selects the n-gram size; the numeric value plus one is the n-gram length.
        /// </summary>
        public enum Gram
        {
            UniGram = 0,
            BiGram = 1,
            TriGram = 2,
        }

        /// <summary>
        /// Reads every line from <paramref name="sr"/>, counts the n-grams found in the
        /// first two tab-separated fields, and fills the score dictionaries.
        /// </summary>
        /// <param name="gram">N-gram size to extract.</param>
        /// <param name="sr">Reader over the input; a line is positive when its third field is "1".</param>
        /// <exception cref="ArgumentNullException"><paramref name="sr"/> is null.</exception>
        static void Create(Gram gram, StreamReader sr)
        {
            if (sr == null)
            {
                throw new ArgumentNullException("sr");
            }

            int gramLength = Convert.ToInt32(gram) + 1;
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                // Blank lines (typically a trailing newline) carry no sample; skipping them
                // keeps them from being counted as negative examples.
                if (line.Length == 0)
                {
                    continue;
                }

                string[] take = line.Split('\t');
                bool isPositive = take.Length > 2 && take[2] == "1";
                if (isPositive)
                {
                    posfirst++;
                }
                else
                {
                    negfirst++;
                }

                // Only the first two fields hold text; a line may have fewer than two.
                int fieldCount = Math.Min(2, take.Length);
                for (int k = 0; k < fieldCount; k++)
                {
                    string field = take[k];
                    for (int i = 0; i + gramLength <= field.Length; i++)
                    {
                        CountGram(field.Substring(i, gramLength), isPositive);
                    }
                }
            }

            // Empty input: nothing to score (and no maximum to take).
            if (word.Count == 0)
            {
                return;
            }

            // Largest positive-class count of any n-gram; normalises tf.
            int maxPositiveCount = wordpos1.Values.Max();
            double n1 = posfirst;
            double n0 = negfirst;

            foreach (KeyValuePair<string, int> item in word)
            {
                // NOTE(review): n11/n10 are occurrence counts, not line counts, so an n-gram
                // repeated within a line can make n01/n00 negative - confirm this is intended.
                double n11 = wordpos1[item.Key];
                double n10 = wordneg1[item.Key];
                double n01 = n1 - n11;
                double n00 = n0 - n10;

                double cross = n11 * n00 - n10 * n01;
                double denominator = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00);
                // A zero margin would give 0/0; report "no association" instead of NaN.
                double chi = denominator == 0 ? 0 : (n11 + n10 + n01 + n00) * cross * cross / denominator;
                word_x[item.Key] = chi;

                double tf = maxPositiveCount == 0 ? 0 : n11 / maxPositiveCount;
                wordtf[item.Key] = tf;

                double idf = Math.Log(n1 / (n11 + 1));
                wordidf[item.Key] = idf;

                wordtfidf[item.Key] = tf * idf;
            }
        }

        /// <summary>
        /// Records one occurrence of <paramref name="gramText"/> in the total and per-class counters.
        /// </summary>
        static void CountGram(string gramText, bool isPositive)
        {
            if (!word.ContainsKey(gramText))
            {
                // Register the n-gram in all three tables so scoring can index each of them.
                word.Add(gramText, 0);
                wordpos1.Add(gramText, 0);
                wordneg1.Add(gramText, 0);
            }

            word[gramText]++;
            if (isPositive)
            {
                wordpos1[gramText]++;
            }
            else
            {
                wordneg1[gramText]++;
            }
        }

        /// <summary>
        /// Runs the unigram case: scores tmp2.txt and writes the top rows, ordered by
        /// chi-square descending, to result1.txt.
        /// </summary>
        /// <remarks>
        /// For the bigram or trigram case, pass <see cref="Gram.BiGram"/> or
        /// <see cref="Gram.TriGram"/> to <see cref="Create"/> and change the output file
        /// (the original notes used result2.txt / result3.txt with a 10000-row limit).
        /// <see cref="Create"/> already computes every score, so nothing else needs repeating.
        /// </remarks>
        static void Main(string[] args)
        {
            // Dispose the reader as soon as counting is done.
            using (StreamReader sr = new StreamReader("C:\\Users\\v-yinqhe\\Desktop\\task\\classify\\tmp2.txt"))
            {
                Create(Gram.UniGram, sr);
            }

            // Order by chi-square, highest first.
            var items = from pair in word_x orderby pair.Value descending select pair;

            // The writer must be disposed, otherwise buffered rows never reach the file.
            using (StreamWriter sw = new StreamWriter(@"C:\\Users\\v-yinqhe\\Desktop\\task\\classify\\result1.txt"))
            {
                sw.WriteLine("string\t chi-square \t tf \t idf \t tf idf \t \n");
                int n = 0;
                foreach (KeyValuePair<string, double> pair in items)
                {
                    sw.WriteLine("{0}:{1},{2},{3},{4} \n", pair.Key, pair.Value, wordtf[pair.Key], wordidf[pair.Key], wordtfidf[pair.Key]);
                    n++;
                    // Kept from the original: stops after the 1001st row has been written.
                    if (n > 1000)
                    {
                        break;
                    }
                }
            }
        }
    }
}
Using chi-square to analyze a word's association with the two classes
标签:
原文地址:http://www.cnblogs.com/sumile123/p/4735194.html