该算法是为了对一些专业文章进行词汇关联分析而实现的,并不是Apriori的最佳应用,却是对词频分析的一种实践。
package com.my.analysis; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; import redis.clients.jedis.Jedis; public class AprioriMyImpl { private double minsup = 0.3;// 最小支持度 private double minconf = 0.99;// 最小置信度 private int limitword = 100;// 参加统计的 private ArrayList<Set<String>> aricleWL;// private ArrayList<Set<Set<String>>> candidateList;//候选项list private ArrayList<Set<Set<String>>> frequencyList;//频繁项list public Set<Set<String>> allSub = new HashSet<Set<String>>();//最大频繁项的所有子集 private long filecount;//文件的总数量 private int step = 1;//表示进行到第一步了 private Jedis jedis = new Jedis("localhost", 6379); public AprioriMyImpl() { candidateList = new ArrayList<Set<Set<String>>>(); frequencyList = new ArrayList<Set<Set<String>>>(); aricleWL = new ArrayList<Set<String>>(); filecount = jedis.llen(AnsjTxtFileParserForRedis.FILELIST); for(int i = 0;i < filecount;i++){ aricleWL.add(jedis.smembers(AnsjTxtFileParserForRedis.FILEPREFIX+i)); } } /** * 初始化第一个候选项集合 */ // public void item1_init(){ // Set<Set<String>> candidate1 = new HashSet<Set<String>>(); // Set<String> tset = jedis.zrevrange(AnsjTxtFileParserForRedis.TABLENAME, 0,limitword-1); // for(String s:tset){ // HashSet<String> one = new HashSet<String>(); // one.add(s); // candidate1.add(one); // } // candidateList.add(candidate1); // System.out.println("候选项集-"+(step)+":"); // printSetSetString(candidate1); // } public void item1_init(){ String[] keys 
={"睡眠","时间","宝宝","治疗","疾病","身体","呼吸","质量","孩子","入睡","人体","精神","习惯","心理","障碍","枕头","保健","关注","医生","女性","症状","食物","饮食","运动","中医","床垫","儿童","婴儿","阅读","大脑","按摩","效果","癫痫","环境","营养","压力","血液","智能","休息","妈妈","男人","生理","医学","社会","药物","肌肉","男性","科技","恢复","减肥","放松","神经","危害","情绪","怀孕","午睡","分泌","下降","反馈","音乐","刺激","糖尿病","姿势","老人","熬夜","消化","记忆","消除","起床","客户","食品","感冒","高血压","招聘","老年人","孕妇","手表","解决","现象","超过","颈椎","全身","空调","侧卧","位置","体温","金笔","达到","打鼾","电视","能量","催眠","物质","状况","精力","作者","设备","价格","病人","保护","数据","经验","正文","适合","妇科","锻炼","新生儿","咳嗽","抑郁症","血管","抑制","幼儿","失眠症","心脏病","食疗","血压","肿瘤","诱发","重视","心血管","寿命","小便","免疫力","月经","评测","记忆力","智力"}; Set<Set<String>> candidate1 = new HashSet<Set<String>>(); for(String s:keys){ HashSet<String> one = new HashSet<String>(); one.add(s); candidate1.add(one); } candidateList.add(candidate1); System.out.println("候选项集-"+(step)+":"); printSetSetString(candidate1); } /** * 候选项集转化为频繁项集 */ public boolean candidateToFrequency(){ Set<Set<String>> candItems = candidateList.get(step-1); Set<Set<String>> freqItems = new HashSet<Set<String>>(); for(Set<String> item:candItems){ if((count_sup(item)/filecount)>=minsup){ freqItems.add(item); } } if(freqItems.size()==0){//无法产生符合条件的频繁项集 return false; } frequencyList.add(freqItems); System.out.println("频繁项集-"+(step)+":"); printSetSetString(freqItems);//输出频繁项集 step++; return true; } /** * 频繁项集形成新的候选项集 */ public boolean frequencyToCandidate(){ Set<Set<String>> frequencyItems = frequencyList.get(step-2); Set<String> maxSub = maxSubSet(frequencyItems); Set<Set<String>> candidateItems = new HashSet<Set<String>>(); for(Set<String> freqs : frequencyItems){ int len = freqs.size(); for(String sub:maxSub){ Set<String> pItem = new HashSet<String>(); pItem.addAll(freqs); pItem.add(sub); if(pItem.size()==(len+1)&&subIsFreq(frequencyItems,pItem)){ candidateItems.add(pItem); } } } if(candidateItems.size()==0){//没有形成新的候选集 return false; } candidateList.add(candidateItems); System.out.println("候选项集-"+(step)+":"); 
printSetSetString(candidateItems);//输出频繁项集 return true; } /** * parentSet的子集在频繁集合freq中 * @param freq * @param parentSet * @return true 是 ; false 否 */ public boolean subIsFreq(Set<Set<String>> freq,Set<String> parentSet){ for(String s:parentSet){ Set<String> item = new HashSet<String>(); item.addAll(parentSet); item.remove(s); if(!freq.contains(item)){ return false; } } return true; } /** * 获得频繁项集的最大项集 * @param freqIntems */ public Set<String> maxSubSet(Set<Set<String>> freqIntems){ Set<String> maxSub = new HashSet<String>(); for(Set<String> ss:freqIntems){ for(String s:ss){ maxSub.add(s); } } return maxSub; } /** * 计算支持度 * @param x * @return */ public double count_sup(Set<String> x){ int temp = 0; for(Set<String> ss:aricleWL){ if(ss.containsAll(x)){ temp++; } } return temp; } /** * 计算集合x=>y的置信度 * @param x * @param y * @return */ public double cout_cand(Set<String> x,Set<String> y){ Set<String> z = new HashSet<String>(); z.addAll(x); z.addAll(y); return count_sup(z)/count_sup(x); } /** * 获得所有的子集 * @param parent */ public void genSub(Set<String> parent){ if(parent.size()>0){ allSub.add(parent); } Set<String> ss = new HashSet<String>(); ss.addAll(parent); for(String s:ss){ Set<String> ss2 = new HashSet<String>(); ss2.addAll(ss); ss2.remove(s); genSub(ss2); } } /** * 输出 * @param sss */ public void printSetSetString(Set<Set<String>> sss){ for(Set<String> ss:sss){ System.out.println(ss); } } /** * 关联度分析 * @param subSet */ public void releRuleCount(Set<Set<String>> subSet){ for(Set<String> x:subSet){ for(Set<String> y:subSet){ Set<String> xy = new HashSet<String>(); xy.addAll(x); xy.addAll(y); if(xy.size()==(x.size()+y.size())){ double sup_count = cout_cand(x,y); if(sup_count>minconf){ System.out.println(x+"==>>"+y+"=="+sup_count); } } } } } public void jisuan(){ item1_init();//第一个候选项集的初始化 while(true){ if(!candidateToFrequency()) break; if(!frequencyToCandidate()) break; } Set<Set<String>> maxfreqs = frequencyList.get(frequencyList.size()-1); for(Set<String> 
maxfreq:maxfreqs){ allSub = new HashSet<Set<String>>(); genSub(maxfreq); releRuleCount(allSub); } } public static void main(String[] args) { //初始化候选项,取前几位word new AprioriMyImpl().jisuan(); } }
原文地址:http://blog.csdn.net/suifengerbi/article/details/45642845