标签:mining embedded subt 频繁子树模式挖掘 prefixtreeespan
直接说这算法的作用吧,
上图的右边就是左边图的 ES(Embedded Subtree),相对应的ABC-1D-1-1E-1-1就是上图左边的Pre-Order-String,也是这个算法输入的数据格式,中间我们用空格隔开(A B C -1 D -1 -1 E -1 -1),-1就是表示回走。
这算法主要是用来挖掘频繁子树,也就是将一个树里的所有频繁子树找出来。原理就不说了,大家可以自己去搜下这篇论文,这是邹老师的一篇论文。这里我主要采用java来实现,所先针对GrowthElement、PrefixTree、PrefixTreeNode、ProjectedDB建立实体类,然后再递归实现搜索ES。
package iie.ucas.treeMiner.bean; public class GrowthElement { private String label = ""; private int attached = 0;// attached means that the GEs is attached to the private int times = 0;// 出现的次数,跟min_sup比较 // n-th node of the prefix-tree. public String getLabel() { return label; } public int getTimes() { return times; } public void setTimes(int times) { this.times = times; } public void setLabel(String label) { this.label = label; } public int getAttached() { return attached; } public void setAttached(int attached) { this.attached = attached; } public String toString(){ return "(" + label + ","+ getAttached() + "):" + getTimes(); } } <pre name="code" class="java">package iie.ucas.treeMiner.bean; public class PrefixTree { private ProjectedDB prodb; private PrefixTreeNode[] ptnodes; private int length = 0; private int times = 1; public PrefixTree(int n) { ptnodes = new PrefixTreeNode[n];// 频繁模式的prefix的label个数<A C -1 -1>,则n为2 for (int i = 0; i < n; i++) { ptnodes[i] = new PrefixTreeNode(); } length = n; } public PrefixTreeNode getPtnode(int index) { if (index < length) return ptnodes[index]; else return null; } public int getTimes() { return times; } public void setTimes(int times) { this.times = times; } public void setPtnode(int index, int numMinus, String label, int pos) { if (index < length) { ptnodes[index].setNumMinus(numMinus); ptnodes[index].setLabel(label); ptnodes[index].setPos(pos); } } public String toString() { String res = ""; for (int i = 0; i < length; i++) { if (i == 0) res += ptnodes[i].getLabel(); else { res += " " + ptnodes[i].getLabel(); } } return res; } public String toLastLabelString() { String res = ""; if(length == 1) return ptnodes[0].getLabel(); for (int i = 0; i < length; i++) { if (i == 0) { res += ptnodes[i].getLabel(); for (int j = 0; j < ptnodes[i].getNumMinus(); j++) { res += " " + "-1"; } } else if (i == length - 1) { res += " " + ptnodes[i].getLabel(); } else { res += " " + ptnodes[i].getLabel(); for (int j = 0; j < ptnodes[i].getNumMinus(); j++) { res += " " + "-1"; } } } return res; } public String toString(boolean isComplete) { String res = ""; for (int i = 0; i < length; i++) { if (i == 0) { res += ptnodes[i].getLabel(); for (int j = 0; j < ptnodes[i].getNumMinus(); j++) { res += " " + "-1"; } } else { res += " " + ptnodes[i].getLabel(); for (int j = 0; j < ptnodes[i].getNumMinus(); j++) { res += " " + "-1"; } } } return res; } public int getLength() { return length; } public void setLength(int length) { this.length = length; } public ProjectedDB getProdb() { return prodb; } public void setProdb(ProjectedDB prodb) { this.prodb = prodb; } public PrefixTreeNode[] getPtnode() { return ptnodes; } public void setPtnode(PrefixTreeNode[] ptnodes) { this.ptnodes = ptnodes; } }
package iie.ucas.treeMiner.bean; public class PrefixTreeNode { private String label = ""; private int pos = 0; private int numMinus = 0;// 用来表示在下一个label前着-1的个数 public int getNumMinus() { return numMinus; } public void setNumMinus(int numMinus) { this.numMinus = numMinus; } public String getLabel() { return label; } public void setLabel(String label) { this.label = label; } public int getPos() { return pos; } public void setPos(int pos) { this.pos = pos; } }
package iie.ucas.treeMiner.bean; import java.util.ArrayList; public class ProjectedDB { private ArrayList<String> pre_order_list;// 用来存放ProDB,eg.{A B C -1 -1- 1;A B // B -1 D -1 -1 -1} public ProjectedDB() { pre_order_list = new ArrayList<String>(); } public ArrayList<String> getPre_order_list() { return pre_order_list; } public void setPre_order_list(ArrayList<String> preOrderList) { pre_order_list = preOrderList; } }
package iie.ucas.treeMiner.action; import iie.ucas.treeMiner.bean.GrowthElement; import iie.ucas.treeMiner.bean.PrefixTree; import iie.ucas.treeMiner.bean.ProjectedDB; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map.Entry; public class PrefixTreeESpanAction { /** * 按行读取数据,并存放到ArrayList中 * * @param path * @return * @throws IOException * @throws IOException */ public ArrayList<String> file2arrlist(String path) throws IOException, IOException { ArrayList<String> arrlist = new ArrayList<String>(); File datafile = new File(path); String line = ""; BufferedReader input = new BufferedReader(new InputStreamReader( new FileInputStream(datafile), "utf-8")); while ((line = input.readLine()) != null) { arrlist.add(line); } input.close(); return arrlist; } /** * 统计arrlist中每个label出现的次数 * * @param arrlist * @return */ public HashMap<String, Integer> getAllLabel(ArrayList<String> arrlist) { HashMap<String, Integer> hashmap = new HashMap<String, Integer>(); for (int i = 0; i < arrlist.size(); i++) { String[] labels = arrlist.get(i).split(" ");// 返回第i行的所有label for (int j = 0; j < labels.length; j++) { if (!labels[j].equals("-1")) {// 筛选所有label if (hashmap.containsKey(labels[j])) {// label已经存在,直接在相应的value上加1 hashmap.put(labels[j], hashmap.get(labels[j]) + 1); } else { hashmap.put(labels[j], 1); } } } } return hashmap; } /** * 根据prefixtree构造相应的prodb * * @param prodb * @param pretree * @return */ public ProjectedDB getProjectedDB(ProjectedDB prodb, PrefixTree pretree) {// suppose // pretree:<A(1) // -1> ProjectedDB resprodb = new ProjectedDB(); ArrayList<String> pretlist = new ArrayList<String>(); String label = pretree.getPtnode()[pretree.getLength() - 1].getLabel(); System.out.println("<" + pretree.toString(true) + ">-projectedDB"); ArrayList<String> prodblist = prodb.getPre_order_list(); for (int i = 0; i < prodblist.size(); i++) { String[] labels = prodblist.get(i).split(" "); int length = labels.length; for (int k = pretree.getLength()-1; k < length; k++) { String prodbElement = label;//pretree.toLastLabelString();///?????????????????????????????????????? //System.out.println("lastlabelString :" + prodbElement); // String prodbElement = label;// 表示一行such as.{A C E -1 -1 -1} // System.out.println("prodbElement:" + prodbElement); if (label.equals(labels[k])) {// find all occurrence of label // ,and // construct prodb int contLabel = 1;//pretree.getLength();//???????? int minus = 0;// 用来标记后面出现-1的个数 for (int j = k+1; j < length; j++) {//从根label一样的那个位置的下一个位置开始 if (!labels[j].equals("-1")) {// not -1 contLabel++; prodbElement += " " + labels[j]; } else { minus++; prodbElement += " " + labels[j]; if (minus >= contLabel) {// //k = j + 1; break; } } }// break System.out.println("prodbElement:" + prodbElement); pretlist.add(prodbElement);// prodb的一项内容,也就是一行 }// end{if} } } resprodb.setPre_order_list(pretlist); return resprodb; } /** * 返回pretree对应的 GrowthElements * * @param prodb * @param min_sup * @param pretree * @return */ public HashSet<GrowthElement> getGEs(ProjectedDB prodb, PrefixTree pretree) { // ProjectedDB pdb = this.getProjectedDB(prodb, pretree); ArrayList<String> arrprodb = prodb.getPre_order_list();// pdb.getPre_order_list(); HashSet<GrowthElement> geset = new HashSet<GrowthElement>(); int prelength = pretree.getLength(); for (int i = 0; i < arrprodb.size(); i++) { String labels[] = arrprodb.get(i).split(" "); for (int j = prelength; j < labels.length; j++) { // if (prelength == 1) { // // } else {// pretree.getLength()!=1 int contLabel = 0; int minus = 0; if (!labels[j].equals("-1")) { if (contLabel >= minus) {// GE attached to // pretree.getLength() GrowthElement tmpge = null; boolean flag = false;// 用来表示新的GE是否之前已经找到 Iterator<GrowthElement> it = geset.iterator(); while (it.hasNext()) { tmpge = it.next(); if (tmpge.getLabel().equals(labels[j]) && tmpge.getAttached() == prelength) { flag = true; break; } }// break if (flag) {// 该GE之前就找到过了 tmpge.setTimes(tmpge.getTimes() + 1); geset.add(tmpge); } else { GrowthElement ge = new GrowthElement(); ge.setAttached(prelength); ge.setLabel(labels[j]); ge.setTimes(1); geset.add(ge); } } else {// ////////////////////////////////////////////////////// GrowthElement tmpge = null; boolean flag = false;// 用来表示新的GE是否之前已经找到 Iterator<GrowthElement> it = geset.iterator(); while (it.hasNext()) { tmpge = it.next(); if (tmpge.getLabel().equals(labels[j]) && tmpge.getAttached() == prelength) { flag = true; break; } }// break if (flag) {// 该GE之前就找到过了 tmpge.setTimes(tmpge.getTimes() + 1); geset.add(tmpge); } else { GrowthElement ge = new GrowthElement(); ge.setAttached(prelength + (contLabel - minus)); ge.setLabel(labels[j]); ge.setTimes(1); geset.add(ge); } } contLabel++; } else {// labels[j].equals("-1") minus++; } } } return geset; } public PrefixTree extendSByGE(GrowthElement ge, PrefixTree pretree, int min_sup) { PrefixTree respret = null; int length = pretree.getLength(); if (ge.getTimes() >= min_sup) {// 判断是否符合大于min_sup if (1 == length) { respret = new PrefixTree(length + 1); respret.setPtnode(0, 0, pretree.getPtnode(0).getLabel(), 1); respret.setPtnode(1, 2, ge.getLabel(), 2); } else { respret = new PrefixTree(length + 1); int attached = ge.getAttached(); if (attached == length) { for (int i = 0; i < length; i++) {// 将之前的信息复制过来eg.{A B -1 // -1}GE是(C,2)则{A B C -1 // -1 -1} respret.setPtnode(i, 0, pretree.getPtnode(i).getLabel(), i + 1); } respret.setPtnode(length, length + 1, ge.getLabel(), length + 1);// 设置新加入的GE的信息 } else {// ***************************************************************************88// for (int i = 0; i < length - 1; i++) {// 将之前的信息复制过来eg.{A B // -1 -1}GE是(C,2)则{A // B C -1 -1 -1} respret.setPtnode(i, pretree.getPtnode(i).getNumMinus(), pretree .getPtnode(i).getLabel(), pretree .getPtnode(i).getPos()); } // int index,int numMinus, String label,int pos respret.setPtnode(length - 1, (length - attached), pretree .getPtnode(length - 1).getLabel(), pretree .getPtnode(length - 1).getPos()); respret.setPtnode(length, attached + 1, ge.getLabel(), length + 1);// 设置新加入的GE的信息 } } } else {// end{if}判断是否符合大于min_sup return null; } System.err.println("添加GE后的prefix:" + respret.toString(true)); return respret; } public void Fre(PrefixTree pretree, ProjectedDB prodb, int min_sup) { HashSet<GrowthElement> geSet = this.getGEs(prodb, pretree); Iterator<GrowthElement> it = geSet.iterator(); while (it.hasNext()) { GrowthElement ge = it.next(); System.out.println("ge: " + ge.toString()); Fre(this.extendSByGE(ge, pretree, min_sup), this.getProjectedDB( prodb, this.extendSByGE(ge, pretree, min_sup)), min_sup); } return ; } public static void main(String[] args) throws IOException { PrefixTreeESpanAction ptes = new PrefixTreeESpanAction(); ArrayList<String> arrlist = ptes .file2arrlist("C:\\Users\\Fernando\\Desktop\\treedata\\CSlog.data");// 原始数据 HashMap<String, Integer> all_label = ptes.getAllLabel(arrlist); System.out.println("所有label数量: " + all_label.size()); ProjectedDB basedb = new ProjectedDB(); basedb.setPre_order_list(arrlist);// 最先的原始proDB PrefixTree pretrees[] = new PrefixTree[all_label.size()];// {a -1},{b // -1}.... for (int i = 0; i < pretrees.length; i++) { pretrees[i] = new PrefixTree(1);// 生成长度为1的prefixtree } Iterator iter = all_label.entrySet().iterator(); int index = 0; while (iter.hasNext()) { Entry entry = (Entry) iter.next(); pretrees[index].setLength(1); pretrees[index].getPtnode(0).setLabel((String) entry.getKey()); pretrees[index].getPtnode(0).setPos(1); pretrees[index].getPtnode(0).setNumMinus(1); index++; } int min_sup = 1; for (int i = 0; i < pretrees.length; i++) { ProjectedDB ptdb = ptes.getProjectedDB(basedb, pretrees[i]); ptes.Fre(pretrees[i],ptdb, min_sup); // HashSet<GrowthElement> geSet = ptes.getGEs(ptdb, pretrees[i]); // Iterator<GrowthElement> it = geSet.iterator(); // while (it.hasNext()) { // GrowthElement ge = it.next(); // System.out.println("ge: " + ge.toString()); // PrefixTree induce_pre = ptes.extendSByGE(ge, pretrees[i], // min_sup); // ProjectedDB induceDB = ptes.getProjectedDB(ptdb, induce_pre) ; // HashSet<GrowthElement> geSet1 = ptes.getGEs(induceDB, // induce_pre); // Iterator<GrowthElement> it1 = geSet1.iterator(); // while (it1.hasNext()) { // GrowthElement ge1 = it1.next(); // // System.out.println("ge1: " + ge.toString()); // PrefixTree induce_pre1 = ptes.extendSByGE(ge1, induce_pre, // min_sup); // } // System.out.println("======================"); // } } // ProjectedDB ptdb = new ProjectedDB(); // for (int i = 0; i < pretrees.length; i++) { // //{A -1}-proDB // ptdb = ptes.getProjectedDB(basedb, pretrees[i]);// /记录下来下次用得到 // HashSet<GrowthElement> geSet = ptes.getGEs(ptdb, pretrees[i]); // Iterator<GrowthElement> it = geSet.iterator(); // while (it.hasNext()) { // GrowthElement ge = it.next(); // System.out.println("ge: " + ge.toString()); // PrefixTree induce_pre = ptes.extendSByGE(ge, pretrees[i], min_sup); // ProjectedDB induceDB = ptes.getProjectedDB(ptdb, induce_pre) ; // HashSet<GrowthElement> inducegeSet = ptes.getGEs(induceDB, // pretrees[i]); // } // // } // =================================================================================================== // PrefixTree pretree = new PrefixTree(1); // pretree.getPtnode()[0].setLabel("72"); // pretree.getPtnode()[0].setPos(1); // // pretree.getPtnode()[0].setContainMinus(true);//<72(1) // // -1>,也就是第一个位置的标签是72 // // ***********// // // ProjectedDB prodb1 = ptes.getProjectedDB(prodb, pretree); // // ***********// // HashSet<GrowthElement> hashset = ptes.getGEs(prodb, pretree); // Iterator<GrowthElement> it = hashset.iterator(); // System.out.println("GEs:"); // while (it.hasNext()) { // GrowthElement ge = it.next(); // System.out.println("ge: " + ge.toString()); // ptes.extendSByGE(ge, pretree, 2); // } // ArrayList<String> treedata = new ArrayList<String>(); // File datafile = new File( // "C:\\Users\\Fernando\\Desktop\\treedata\\test.txt"); // BufferedReader input = new BufferedReader(new InputStreamReader( // new FileInputStream(datafile), "utf-8")); // // String line = ""; // // while ((line = input.readLine()) != null) { // treedata.add(line); // } // // treedata = buffer.toString(); // input.close(); // // System.out.println("lines:" + treedata.size()); // // String data[] = treedata.get(0).split(" "); // System.out.println(data[1].equals("-1")); // System.out.println(data[0] + ":" + data[1]); // HashSet<GrowthElement> geset = new HashSet<GrowthElement>(); // GrowthElement ge = new GrowthElement(); // geset.add(ge); // ge.setAttached(11); // geset.add(ge); // System.out.println(geset.size()); } }
prefixTreeEspan 频繁子树模式挖掘 A pattern growth 算法实现 mining embedded subtrees.
标签:mining embedded subt 频繁子树模式挖掘 prefixtreeespan
原文地址:http://blog.csdn.net/sh_c1991/article/details/41350383