码迷,mamicode.com
首页 > 其他好文 > 详细

using chi-square to analyze a word's relationship with two great parts

时间:2015-08-17 00:39:08      阅读:145      评论:0      收藏:0      [点我收藏+]

标签:

注意代码里面用变量控制分类的写法    Uni Bi Tri

而且开始想的是两层循环,大循环找词,小循环再每行数数,这样会太慢了

所以借鉴源码的思路,是通过一个大循环,在dictionary中记录每个词出现多少次,同时用find1标记这一句是否标注为1

wordpos1相当于记录的是每个词的n11,而对于每个词n.1都是一样的,就是出现标记1的话的总次数

wordneg1记录的是每个词的n10,即出现目标词但是没有标1的次数;同样对每个词n.0都是一样的,即没有标1的句子的总数

这个写的思路也告诉我要得到数据但是全都记录可能会比较复杂,记录间接数据也可以帮助得到最终结果

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;

namespace ConsoleApplication2
{
    class Program
    {
        // Raw counts filled by Create, keyed by n-gram text.
        // word:     total occurrences of the n-gram in the first two tab-separated columns of all lines.
        // wordpos1: occurrences on lines whose third column is "1" (used as n11).
        // wordneg1: occurrences on every other line (used as n10).
        static Dictionary<string, int> word = new Dictionary<string, int>();
        static Dictionary<string, int> wordpos1 = new Dictionary<string, int>();
        static Dictionary<string, int> wordneg1 = new Dictionary<string, int>();

        // Per-n-gram scores computed at the end of Create.
        // word_x:    chi-square statistic.
        // wordtf:    wordpos1 count divided by the largest wordpos1 count.
        // wordidf:   Math.Log(posfirst / (wordpos1 count + 1)).
        // wordtfidf: wordtf * wordidf.
        static Dictionary<string, double> word_x= new Dictionary<string, double>();
        static Dictionary<string, double> wordtf = new Dictionary<string, double>();
        static Dictionary<string, double> wordidf = new Dictionary<string, double>();
        static Dictionary<string, double> wordtfidf = new Dictionary<string, double>();

        // Number of input lines whose third column is "1" (posfirst) and number of all other lines (negfirst).
        static int posfirst = 0;
        static int negfirst = 0;
        /// <summary>
        /// Size of the character n-gram to extract. The numeric value of each member is
        /// the n-gram length minus one (0, 1, 2), which is how Create interprets it.
        /// </summary>
        public enum Gram
        {
            /// <summary>Single characters (value 0).</summary>
            UniGram,

            /// <summary>Pairs of adjacent characters (value 1).</summary>
            BiGram,

            /// <summary>Triples of adjacent characters (value 2).</summary>
            TriGram
        }
        /// <summary>
        /// Reads tab-separated lines from <paramref name="sr"/>, counts every character n-gram
        /// found in the first two columns, and then fills the static score tables
        /// (word_x, wordtf, wordidf, wordtfidf) for each n-gram seen.
        /// A line counts as positive when its third column is exactly "1"; every other
        /// non-empty line counts as negative.
        /// </summary>
        /// <param name="gram">N-gram size; its numeric value is the n-gram length minus one.</param>
        /// <param name="sr">Reader positioned at the start of the labeled data. It is read to the end but not closed.</param>
        /// <exception cref="ArgumentNullException"><paramref name="sr"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// An n-gram already has scores, i.e. Create was called before without clearing the static tables.
        /// </exception>
        static void Create(Gram gram,StreamReader sr)
        {
            if (sr == null)
            {
                throw new ArgumentNullException("sr");
            }

            // Gram values are zero-based (UniGram = 0), so the n-gram length is value + 1.
            int gramLength = Convert.ToInt32(gram) + 1;

            string line;
            while ((line = sr.ReadLine()) != null)
            {
                // A blank line carries neither text nor a label; skip it instead of
                // counting it as a negative sample.
                if (line.Length == 0)
                {
                    continue;
                }

                string[] take = line.Split('\t');

                bool isPositive = take.Length > 2 && take[2] == "1";
                if (isPositive)
                {
                    posfirst++;
                }
                else
                {
                    negfirst++;
                }

                // Only the first two columns hold text. A line without a tab has just one,
                // so clamp the column count rather than indexing past the array.
                int textColumns = Math.Min(2, take.Length);
                for (int k = 0; k < textColumns; k++)
                {
                    string text = take[k];
                    for (int i = 0; i + gramLength <= text.Length; i++)
                    {
                        string temp = text.Substring(i, gramLength);

                        int seen;
                        if (!word.TryGetValue(temp, out seen))
                        {
                            // First sighting: create both per-class counters so the
                            // increments below never hit a missing key.
                            wordpos1[temp] = 0;
                            wordneg1[temp] = 0;
                        }
                        word[temp] = seen + 1;

                        if (isPositive)
                        {
                            wordpos1[temp]++;
                        }
                        else
                        {
                            wordneg1[temp]++;
                        }
                    }
                }
            }

            // Nothing was counted (empty input): there is nothing to score, and taking
            // the maximum of an empty collection would throw.
            if (word.Count == 0)
            {
                return;
            }

            // Largest positive-class count of any n-gram; normalizes tf below.
            int maxPositive = wordpos1.Values.Max();

            // NOTE(review): n11/n10 are occurrence counts while n1/n0 are line counts, so an
            // n-gram repeated within a line can make n01 or n00 negative - confirm this is intended.
            // If no line was labeled positive, the divisions below produce NaN/-Infinity
            // (double arithmetic) rather than an exception.
            double n1 = posfirst;
            double n0 = negfirst;
            foreach (string key in word.Keys)
            {
                double n11 = wordpos1[key];
                double n10 = wordneg1[key];
                double n01 = n1 - n11;
                double n00 = n0 - n10;

                double total = n11 + n10 + n01 + n00;
                double diff = n11 * n00 - n10 * n01;
                double chi = total * diff * diff / ((n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00));
                word_x.Add(key, chi);

                double tf = n11 / maxPositive;
                wordtf.Add(key, tf);

                double idf = Math.Log(n1 / (n11 + 1));
                wordidf.Add(key, idf);

                wordtfidf.Add(key, tf * idf);
            }
        }
     
        

        /// <summary>
        /// Runs the unigram analysis: builds the score tables from the labeled input file,
        /// then writes the n-grams with the highest chi-square scores to the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Unigram case. The static tables are shared, so each n-gram size is run as a
            // separate execution: for bigrams or trigrams pass Gram.BiGram / Gram.TriGram to
            // Create and write to result2.txt / result3.txt (the earlier runs used a
            // 10000-row cutoff for those). Create already computes every score.
            using (StreamReader sr = new StreamReader("C:\\Users\\v-yinqhe\\Desktop\\task\\classify\\tmp2.txt"))
            {
                Create(Gram.UniGram, sr);
            }

            // Sort by chi-square, highest first.
            var items = from pair in word_x orderby pair.Value descending select pair;

            // Disposing the writer flushes its buffer; without it the tail of the output can be lost.
            // The verbatim literal keeps its doubled backslashes exactly as before.
            using (StreamWriter sw = new StreamWriter(@"C:\\Users\\v-yinqhe\\Desktop\\task\\classify\\result1.txt"))
            {
                sw.WriteLine("string\t chi-square \t tf \t idf \t tf idf \t \n");

                int n = 0;
                foreach (KeyValuePair<string, double> pair in items)
                {
                    sw.WriteLine("{0}:{1},{2},{3},{4} \n", pair.Key, pair.Value,wordtf[pair.Key],wordidf[pair.Key],wordtfidf[pair.Key]);
                    n++;

                    // Stops after the 1001st row has been written, as before.
                    if (n > 1000)
                    {
                        break;
                    }
                }
            }
        }
    }
}

 

using chi-square to analyze a word's relationship with two great parts

标签:

原文地址:http://www.cnblogs.com/sumile123/p/4735194.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!