标签:des style blog http io ar color 使用 sp
***********************************************声明******************************************************
原创作品,出自 “晓风残月xj” 博客,欢迎转载,转载时请务必注明出处(http://blog.csdn.net/xiaofengcanyuexj)。
由于各种原因,可能存在诸多不足,欢迎斧正!
*********************************************************************************************************
前面提到关联规则寻找频繁项集的Apriori算法,Apriori算法是挖掘布尔型关联规则频繁项集的最为经典、最为基本的算法,但是该算法需要不断寻找候选集,然后剪枝即去掉包含非频繁子集的候选集,效率不是很高,时间复杂度由暴力枚举所有子集的指数级别O(2^n)降为多项式级别,多项式的具体系数视底层实现情况而定
。Apriori算法的主要瓶颈在于不断寻找候选项集,可不可以找到一种不用频繁寻找候选项集的算法呢?而且当待挖掘的数据很大进而需要存储在数据库中时,Apriori算法还有一个无可回避的问题就是每次都要扫描数据库,涉及大量I/O操作,比较耗时(当然可以不用数据库)。
FP_Growth算法是一种不生成候选集从而寻找频繁项集的算法,主要基于树结构:包含一棵FP_Tree和一个项头表,每个项通过一个结点链指向它在树中出现的位置。基本结构如下所示。需要注意的是项头表需要按照支持度递减排序,在FP_Tree(有后缀的也称条件FP_Tree)中高支持度的节点只能是低支持度节点的祖先节点。这样一来可以保证尽可能地共用祖先节点,更重要的是保证正确性。
procedure FP_Growth(FP_Tree, α)
if FP_Tree 只含单个路径P then{
【1】
for 路径P中结点的每个组合(记作β)
【2】
产生模式βUα,其支持度MinSupport =β 中结点的最小支持度;
}
else{
for each αi 在FP_Tree的项头表(按照支持度由低到高顺序进行扫描){ 【3】
产生一个模式β= αiUα,其支持度MinSupport=αi.MinSupport;
构造β的条件模式基,然后构造β的条件FP_Treeβ;
【4】
if FP_Treeβ不为空 then
调用 FP_Growth (FP_Treeβ, β);
}
}
【1】 FP_Tree 只含单个路径P,即只有一条分支且分支不能分叉,如果分叉可能隐含了分支合并问题,可能导致在未合并之前被误删为不满足最小支持度;
【2】 若分支上有n个属性值,则总共有2^n组合,可以每个属性值取或不取两种情况递归下去;
【3】当前条件FP_Tree的项头表,用头插法建立单链表(新结点总是插入到链表头部);
【4】
以当前项头表的αi沿着条件FP_Tree的每条分支向上找出所有条件模式,然后建立后缀模式β的条件FP_Treeβ
源代码:
在此声明,以下代码并不是系本人原创,如需使用,必须声明,谢谢!
//FP_Tree.h
/**
* Created by xujin on 2014/12/4.
All Rights Reserved,but you can use this program.
*/
#ifndef FP_TREE_H
#define FP_TREE_H
#include"Transaction.h"
#include"TransactionSet.h"
#include<map>
using namespace std;
// Number of child slots in each CFP_TreeNode (size of its m_pChildNode array).
const int MAXN_CHILD=20;
// Type used for item names throughout the FP-tree code.
typedef string ItemType;
/**
 * Pairs an item name with a support count.
 */
struct ItemSupport
{
    ItemType m_ITItemName;   // name of the item
    int m_nSupportCount;     // support count associated with the item

    /**
     * @param tItem item name to store
     * @param tSup  support count to store
     */
    ItemSupport(ItemType tItem,int tSup)
        : m_ITItemName(tItem),
          m_nSupportCount(tSup)
    {
    }
};
struct CFP_TreeNode
{
int m_nSupportCount;
int m_nChildSize;
ItemType m_ITItemName;
CFP_TreeNode *m_pFatherNode;
CFP_TreeNode *m_pLinkedNode;
CFP_TreeNode *m_pChildNode[MAXN_CHILD];
CFP_TreeNode()
{
m_ITItemName.clear();
m_nChildSize=0;
m_nSupportCount =0;
m_pFatherNode=NULL;
m_pLinkedNode=NULL;
for(int i=0;i<MAXN_CHILD;++i)
m_pChildNode[i]=NULL;
}
CFP_TreeNode(int tCount)
{
m_ITItemName.clear();
m_nChildSize=0;
m_nSupportCount =tCount;
m_pFatherNode=NULL;
m_pLinkedNode=NULL;
for(int i=0;i<MAXN_CHILD;++i)
m_pChildNode[i]=NULL;
}
CFP_TreeNode(ItemType tITtem,CFP_TreeNode *tFa,CFP_TreeNode *tLinked,int tCount)
{
m_ITItemName=tITtem;
m_nChildSize=0;
m_nSupportCount=tCount;
m_pFatherNode =tFa;
m_pLinkedNode=tLinked;
for(int i=0;i<MAXN_CHILD;++i)
m_pChildNode[i]=NULL;
}
};
struct CItemHeaderNode
{
int m_nSupportCount;
CFP_TreeNode *m_pFPFirst;
CItemHeaderNode()
{
m_nSupportCount =0 ;
m_pFPFirst = NULL;
}
CItemHeaderNode(int tCount)
{
m_nSupportCount =tCount ;
m_pFPFirst = NULL;
}
};
// FP-tree together with its item header table.
// CFP_Growth is a friend and reads the private members directly.
class CFP_Tree
{
private:
// Minimum confidence as passed to the constructor.
double m_dMinConfidence;
// Minimum support as passed to the constructor.
double m_dMinSupport;
// Size value the constructors multiply the two thresholds by.
int m_nSize;
// m_dMinConfidence * m_nSize, converted to int.
int m_nMinConfidence;
// m_dMinSupport * m_nSize, converted to int; threshold used by eraseInfrequent1ItemSet.
int m_nMinSupport;
// Root node of the tree, allocated by the constructors.
CFP_TreeNode *m_pCFP_TreeRoot;
private:
// Inserts the items of tTran, starting at index id, below tRoot, adding tCount to each node on the path.
void insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount);
// Depth-first walk that prints one line per root-to-leaf path.
void DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet);
// Prints the chain of nodes reachable from tRoot through m_pLinkedNode.
void printLinkList(CFP_TreeNode *tRoot);
// Deletes every node below tRoot (tRoot itself is not deleted).
void destroy(CFP_TreeNode *tRoot);
public:
// Header-table items with their support counts, filled by sortMapItemHeaderList.
vector<ItemSupport>m_vecItemSupportSet;
// Header table: item name -> support count and first node of the item's chain.
map<ItemType,CItemHeaderNode>m_mapItemHeaderList;
// Rebuilds m_vecItemSupportSet from m_mapItemHeaderList, sorted with the file's cmp comparator.
void sortMapItemHeaderList();
// Adds tCount to the header-table entry for tItem, creating the entry if needed.
void addItem(ItemType tItem,int tCount);
// Removes header-table entries whose support count is below m_nMinSupport.
void eraseInfrequent1ItemSet();
/***********************************************
*
* Purpose: sort the items of each transaction in tTranSet by
* support count, from largest to smallest.
*
***********************************************/
void sortTransactionSet(CTransactionSet &tTranSet);
// Counts the items of tTranSet, sorts its transactions and prunes infrequent items; tCount seeds the root node.
CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount);
// Creates an empty tree with the given thresholds and size.
CFP_Tree(double tMinCon,double tMinSup,int tSize);
// Inserts tTran into the tree starting from the root.
void insertFPTree(CTransaction &tTran,int id,int tCount);
// Prints every root-to-leaf path.
void printPath();
// Prints the node chain of every header-table entry that has one.
void printItemHeaderList();
// Returns true when no node from tRoot downwards has more than one child.
bool isSinglePath(CFP_TreeNode *tRoot);
// Deletes all nodes below the root.
void destroy();
friend class CFP_Growth;
};
#endif//FP_Tree.cpp
/**
* Created by xujin on 2014/12/4.
All Rights Reserved,but you can use this program.
*/
#include<algorithm>
#include<stdexcept>
#include"FP_Tree.h"
bool cmp(ItemSupport &a,ItemSupport &b)
{
return a.m_nSupportCount<b.m_nSupportCount;
}
void CFP_Tree::sortMapItemHeaderList()
{
vector<ItemSupport>tItemSupportSet;
for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter)
{
tItemSupportSet.push_back(ItemSupport(iter->first,iter->second.m_nSupportCount));
}
sort(tItemSupportSet.begin(),tItemSupportSet.end(),cmp);
m_vecItemSupportSet.clear();
for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin();iter!=tItemSupportSet.end();++iter)
{
// cout<<"--->("<<iter->m_ITItemName<<","<<iter->m_nSupportCount<<")"<<endl;
m_vecItemSupportSet.push_back(*iter);
}
}
/**
 * Prepares a tree for a transaction set: counts every item, sorts the
 * items of each transaction by support, drops infrequent items from the
 * header table and builds the sorted item list. No transaction is
 * inserted into the tree here.
 *
 * @param tTranSet transaction set to analyse (its transactions are re-sorted in place)
 * @param tMinCon  minimum confidence
 * @param tMinSup  minimum support
 * @param tCount   support count given to the root node
 */
CFP_Tree::CFP_Tree(CTransactionSet &tTranSet,double tMinCon,double tMinSup,int tCount)
    : m_dMinConfidence(tMinCon),
      m_dMinSupport(tMinSup),
      m_nSize(tTranSet.getSize()),
      m_nMinConfidence(static_cast<int>(m_dMinConfidence*m_nSize)),
      m_nMinSupport(static_cast<int>(m_dMinSupport*m_nSize)),
      m_pCFP_TreeRoot(new CFP_TreeNode(tCount))
{
    typedef vector<CTransaction>::iterator TranIter;
    typedef vector<string>::iterator ItemIter;

    // First pass: every occurrence of an item contributes 1 to its support.
    for(TranIter tran=tTranSet.getVeCTransaction().begin();tran!=tTranSet.getVeCTransaction().end();++tran)
    {
        for(ItemIter item=tran->getVecItem().begin();item!=tran->getVecItem().end();++item)
            addItem(*item,1);
    }

    // Sort before pruning so the comparator still sees every item's count.
    sortTransactionSet(tTranSet);
    eraseInfrequent1ItemSet();
    sortMapItemHeaderList();
}
/**
 * Creates an empty tree (default root node, empty header table).
 *
 * @param tMinCon minimum confidence
 * @param tMinSup minimum support
 * @param tSize   size value the thresholds are multiplied by
 */
CFP_Tree::CFP_Tree(double tMinCon,double tMinSup,int tSize)
    : m_dMinConfidence(tMinCon),
      m_dMinSupport(tMinSup),
      m_nSize(tSize),
      m_nMinConfidence(static_cast<int>(m_dMinConfidence*tSize)),
      m_nMinSupport(static_cast<int>(m_dMinSupport*tSize)),
      m_pCFP_TreeRoot(new CFP_TreeNode())
{
}
/**
 * Adds tCount to the support of tItem in the header table.
 *
 * @param tItem  item name
 * @param tCount amount added to the item's support count
 */
void CFP_Tree::addItem(ItemType tItem,int tCount)
{
    // operator[] creates a missing entry with support 0 and no node chain,
    // so a single += covers both the "new item" and "known item" cases.
    m_mapItemHeaderList[tItem].m_nSupportCount+=tCount;
}
void CFP_Tree::eraseInfrequent1ItemSet()
{
for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();)
{
if(iter->second.m_nSupportCount<this->m_nMinSupport)
{
m_mapItemHeaderList.erase(iter);
}
else
{
++iter;
}
}
}
// Holder for the static comparator used to sort the items of a transaction.
class CTransactionItemSort
{
public:
// Comparator passed to std::sort by CFP_Tree::sortTransactionSet.
static bool cmp(ItemType a,ItemType b);
// Tree whose header table cmp reads; assigned by CFP_Tree::sortTransactionSet before sorting.
static CFP_Tree *g_pRoot;
// std::sort needs a function object or a static/global function pointer;
// a pointer to a non-static member function cannot be passed to std::sort directly.
};
// No tree is selected until sortTransactionSet assigns one.
CFP_Tree *CTransactionItemSort::g_pRoot=NULL;
/**
 * Orders two items by descending support count, as recorded in the
 * header table of g_pRoot. Items missing from the table count as -1.
 *
 * Items with equal support are ordered by name. Without that tie-break
 * std::sort may place equally frequent items in a different relative
 * order in different transactions, so the same pair of items could end
 * up as ancestor/descendant in one tree path and the other way round in
 * another; FP-Growth needs one consistent item order for all paths.
 *
 * g_pRoot must have been set (see CFP_Tree::sortTransactionSet).
 *
 * @param a first item name
 * @param b second item name
 * @return true when a must be placed before b
 */
bool CTransactionItemSort::cmp(ItemType a,ItemType b)
{
    map<ItemType,CItemHeaderNode> &header=CTransactionItemSort::g_pRoot->m_mapItemHeaderList;

    map<ItemType,CItemHeaderNode>::iterator pos=header.find(a);
    const int aCount=(pos!=header.end())?pos->second.m_nSupportCount:-1;

    pos=header.find(b);
    const int bCount=(pos!=header.end())?pos->second.m_nSupportCount:-1;

    if(aCount!=bCount)
        return aCount>bCount;
    // Deterministic tie-break keeps the ordering a strict weak ordering
    // that is identical for every transaction.
    return a<b;
}
/**
 * Sorts the items of every transaction in tTranSet with
 * CTransactionItemSort::cmp, using this tree's header table for the
 * support counts.
 *
 * @param tTranSet transaction set whose transactions are sorted in place
 */
void CFP_Tree::sortTransactionSet(CTransactionSet &tTranSet)
{
    // The static comparator reads the counts through this global pointer.
    CTransactionItemSort::g_pRoot=this;

    vector<CTransaction>::iterator tran=tTranSet.getVeCTransaction().begin();
    while(tran!=tTranSet.getVeCTransaction().end())
    {
        sort(tran->getVecItem().begin(),tran->getVecItem().end(),CTransactionItemSort::cmp);
        ++tran;
    }
}
/**
 * Inserts the items of tTran, starting at index id, as a path below
 * tRoot. Insertion stops at the end of the transaction or at the first
 * item that is not in the header table.
 *
 * For each item, an existing child with the same name has tCount added
 * to its support; otherwise a new child is created and placed at the
 * head of the item's node chain in the header table.
 *
 * @param tRoot  node below which the path is inserted; NULL is ignored
 * @param tTran  transaction providing the items
 * @param id     index of the first item to insert; negative or
 *               out-of-range values insert nothing
 * @param tCount support added to every node on the path
 * @throws std::length_error when a node would need more than
 *         MAXN_CHILD children (previously this wrote past the end of
 *         the m_pChildNode array)
 */
void CFP_Tree::insertFPTree(CFP_TreeNode *tRoot,CTransaction &tTran,int id,int tCount)
{
    if(tRoot==NULL||id<0)
        return ;

    const vector<ItemType> &items=tTran.getVecItem();
    CFP_TreeNode *pCur=tRoot;
    for(vector<ItemType>::size_type pos=static_cast<vector<ItemType>::size_type>(id);pos<items.size();++pos)
    {
        const ItemType &item=items[pos];
        map<ItemType,CItemHeaderNode>::iterator header=m_mapItemHeaderList.find(item);
        if(header==m_mapItemHeaderList.end())
            return ;    // item is not in the header table: the path ends here

        // Look for an existing child carrying this item.
        CFP_TreeNode *pChild=NULL;
        for(int i=0;i<pCur->m_nChildSize;++i)
        {
            CFP_TreeNode *candidate=pCur->m_pChildNode[i];
            if(candidate!=NULL&&candidate->m_ITItemName==item)
            {
                pChild=candidate;
                break;
            }
        }

        if(pChild!=NULL)
        {
            pChild->m_nSupportCount+=tCount;
        }
        else
        {
            // Check the capacity before allocating so nothing leaks on failure.
            if(pCur->m_nChildSize>=MAXN_CHILD)
                throw std::length_error("CFP_Tree::insertFPTree: node exceeds MAXN_CHILD children");
            // The new node becomes the head of the item's chain.
            pChild=new CFP_TreeNode(item,pCur,header->second.m_pFPFirst,tCount);
            header->second.m_pFPFirst=pChild;
            pCur->m_pChildNode[pCur->m_nChildSize]=pChild;
            ++pCur->m_nChildSize;
        }
        pCur=pChild;
    }
}
/**
 * Inserts tTran into the tree, starting at the root node.
 *
 * @param tTran  transaction to insert
 * @param id     index of the first item to insert
 * @param tCount support added to every node on the path
 */
void CFP_Tree::insertFPTree(CTransaction &tTran,int id,int tCount)
{
    CFP_TreeNode *root=m_pCFP_TreeRoot;
    insertFPTree(root,tTran,id,tCount);
}
/**
 * Depth-first traversal that prints one line per leaf: the
 * (item,support) pairs collected in tItemSupportSet on the way down.
 *
 * @param tRoot           node to start from
 * @param tItemSupportSet pairs collected so far; restored on return
 */
void CFP_Tree::DFSPrintPath(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet)
{
    const int childTotal=tRoot->m_nChildSize;
    if(childTotal!=0)
    {
        // Inner node: extend the path with each child in turn.
        for(int idx=0;idx<childTotal;++idx)
        {
            CFP_TreeNode *branch=tRoot->m_pChildNode[idx];
            tItemSupportSet.push_back(ItemSupport(branch->m_ITItemName,branch->m_nSupportCount));
            DFSPrintPath(branch,tItemSupportSet);
            tItemSupportSet.pop_back();
        }
        return ;
    }

    // Leaf: print the collected path.
    for(vector<ItemSupport>::size_type pos=0;pos<tItemSupportSet.size();++pos)
    {
        const ItemSupport &step=tItemSupportSet[pos];
        cout<<"--->("<<step.m_ITItemName<<","<<step.m_nSupportCount<<")";
    }
    cout<<endl;
}
void CFP_Tree::printPath()
{
vector<ItemSupport>tItemSupportSet;
cout<<"打印FP_Tree树:"<<endl;
this->DFSPrintPath(m_pCFP_TreeRoot,tItemSupportSet);
}
/**
 * Prints tRoot and every node reachable from it through m_pLinkedNode
 * as "--->(item,support)" entries on one line.
 *
 * @param tRoot first node of the chain; must not be NULL
 */
void CFP_Tree::printLinkList(CFP_TreeNode *tRoot)
{
    CFP_TreeNode *node=tRoot;
    do
    {
        cout<<"--->("<<node->m_ITItemName<<","<<node->m_nSupportCount<<")";
        node=node->m_pLinkedNode;
    }
    while(node!=NULL);
}
void CFP_Tree::printItemHeaderList()
{
cout<<"打印顶点表中每个单链表:"<<endl;
for(map<ItemType,CItemHeaderNode>::iterator iter=m_mapItemHeaderList.begin();iter!=m_mapItemHeaderList.end();++iter)
{
if(iter->second.m_pFPFirst!=NULL)
{
cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :";
printLinkList(iter->second.m_pFPFirst);
cout<<endl;
}
}
}
/**
 * Deletes every node below tRoot and leaves tRoot as an unlinked node
 * without children. tRoot itself is not deleted.
 *
 * The child count is reset to 0 once the children are gone. Leaving the
 * old count in place would make later traversals (DFSPrintPath,
 * isSinglePath) dereference the NULL child slots.
 *
 * @param tRoot node whose subtree is released; NULL is ignored
 */
void CFP_Tree::destroy(CFP_TreeNode *tRoot)
{
    if(tRoot==NULL)
        return ;

    for(int i=0;i<tRoot->m_nChildSize;++i)
    {
        CFP_TreeNode *pChild=tRoot->m_pChildNode[i];
        if(pChild!=NULL)
        {
            destroy(pChild);    // release the grandchildren first
            delete pChild;
            tRoot->m_pChildNode[i]=NULL;
        }
    }
    tRoot->m_nChildSize=0;
    tRoot->m_pFatherNode=NULL;
    tRoot->m_pLinkedNode=NULL;
}
void CFP_Tree::destroy()
{
this->destroy(this->m_pCFP_TreeRoot);
}
/**
 * Checks whether the tree from tRoot downwards is a single chain.
 *
 * @param tRoot node to start from
 * @return true when a node without children is reached by following
 *         the first child; false as soon as a node has more than one child
 */
bool CFP_Tree::isSinglePath(CFP_TreeNode *tRoot)
{
    CFP_TreeNode *node=tRoot;
    for(;;)
    {
        const int children=node->m_nChildSize;
        if(children==0)
            return true;
        if(children>1)
            return false;
        node=node->m_pChildNode[0];
    }
}
//FP_Crowth.h
/**
* Created by xujin on 2014/12/4.
All Rights Reserved,but you can use this program.
*/
#ifndef FP_GROWTH_H
#define FP_GROWTH_H
#include"Transaction.h"
#include"TransactionSet.h"
#include"FP_Tree.h"
// Runs FP-Growth on a CFP_Tree; the mining happens inside the constructor
// and the frequent item sets are printed as they are found.
class CFP_Growth
{
private:
// Tree this object works on (stored by the constructor).
CFP_Tree *m_pCFPTConditionTree;
private:
// Builds a conditional tree for each item of tCFPTTree and mines it recursively.
void initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet);
// Prints the item set in tItemSupportSet when it passes the support threshold.
void printOneFreSet(vector<ItemSupport> &tItemSupportSet);
// Enumerates the combinations of the nodes on the single path starting at tRoot.
void findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet);
public:
// Mines tCFPTTree; tItemSupportSet holds the items collected so far.
CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet);
// Prints every root-to-leaf path of the stored tree.
void printPath();
// Prints the node chain of every header-table entry of the stored tree.
void printItemHeaderList();
};
#endif//FP_Crowth.cpp
/**
* Created by xujin on 2014/12/4.
All Rights Reserved,but you can use this program.
*/
#include"FP_Crowth.h"
#include<algorithm>
using namespace std;
/**
 * Mines a tree that is not a single path. For every item in
 * tCFPTTree->m_vecItemSupportSet a conditional tree is built from the
 * ancestor paths of the item's nodes and mined recursively, with the
 * item appended to tItemSupportSet for the duration of the recursion.
 *
 * Changes from the earlier version:
 *  - each conditional tree and its miner are released once the
 *    recursion returns (both used to be leaked);
 *  - an item missing from the header table is skipped instead of
 *    dereferencing the end iterator;
 *  - the inner loop variable no longer shadows the outer one.
 *
 * @param tCFPTTree       tree to mine
 * @param tItemSupportSet items collected so far; restored before returning
 */
void CFP_Growth::initCFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport>& tItemSupportSet)
{
    for(vector<ItemSupport>::iterator itemIter=tCFPTTree->m_vecItemSupportSet.begin();itemIter!=tCFPTTree->m_vecItemSupportSet.end();++itemIter)
    {
        map<ItemType,CItemHeaderNode>::iterator headerIter=tCFPTTree->m_mapItemHeaderList.find(itemIter->m_ITItemName);
        if(headerIter==tCFPTTree->m_mapItemHeaderList.end())
            continue;

        // Conditional tree for this item, using the parent tree's thresholds.
        CFP_Tree *pCFPTConTree=new CFP_Tree(tCFPTTree->m_dMinConfidence,tCFPTTree->m_dMinSupport,tCFPTTree->m_nSize);
        for(CFP_TreeNode *next=headerIter->second.m_pFPFirst;next!=NULL;next=next->m_pLinkedNode)
        {
            // Walk up from the node's parent; the walk stops at a node with
            // an empty name (the root) or at NULL.
            CTransaction tran;
            const int count=next->m_nSupportCount;
            for(CFP_TreeNode *fa=next->m_pFatherNode;fa!=NULL&&!fa->m_ITItemName.empty();fa=fa->m_pFatherNode)
            {
                tran.addItem(fa->m_ITItemName);
                pCFPTConTree->addItem(fa->m_ITItemName,count);
            }

            // The walk collected the path bottom-up; insert it top-down.
            CTransaction reve;
            for(vector<string>::reverse_iterator pathIter=tran.getVecItem().rbegin();pathIter!=tran.getVecItem().rend();++pathIter)
            {
                reve.addItem(*pathIter);
            }
            pCFPTConTree->insertFPTree(reve,0,count);
        }
        pCFPTConTree->sortMapItemHeaderList();

        tItemSupportSet.push_back(ItemSupport(headerIter->first,headerIter->second.m_nSupportCount));
        {
            // The constructor does the mining; the object is not needed afterwards.
            CFP_Growth miner(pCFPTConTree,tItemSupportSet);
        }
        tItemSupportSet.pop_back();

        // Release the conditional tree: its nodes, its root, then the tree object.
        pCFPTConTree->destroy();
        delete pCFPTConTree->m_pCFP_TreeRoot;
        pCFPTConTree->m_pCFP_TreeRoot=NULL;
        delete pCFPTConTree;
    }
}
/**
 * Stores the tree and mines it straight away:
 *  - root without children: print the collected item set;
 *  - single path: enumerate the combinations of the path's nodes;
 *  - otherwise: build and mine the conditional trees.
 *
 * @param tCFPTTree       tree to mine
 * @param tItemSupportSet items collected so far
 */
CFP_Growth::CFP_Growth(CFP_Tree *tCFPTTree,vector<ItemSupport> &tItemSupportSet)
    : m_pCFPTConditionTree(tCFPTTree)
{
    CFP_TreeNode *root=tCFPTTree->m_pCFP_TreeRoot;
    if(root->m_nChildSize==0)
        printOneFreSet(tItemSupportSet);
    else if(tCFPTTree->isSinglePath(root))
        findCombine(root->m_pChildNode[0],tItemSupportSet);
    else
        initCFP_Growth(tCFPTTree,tItemSupportSet);
}
/**
 * Enumerates every combination of the nodes on the chain that starts at
 * tRoot and follows the first child: each node is first left out, then
 * included. When the end of the chain is reached the current selection
 * is handed to printOneFreSet.
 *
 * @param tRoot           current node of the chain, NULL past the end
 * @param tItemSupportSet current selection; restored on return
 */
void CFP_Growth::findCombine(CFP_TreeNode *tRoot,vector<ItemSupport> &tItemSupportSet)
{
    if(tRoot!=NULL)
    {
        CFP_TreeNode *below=tRoot->m_pChildNode[0];

        // Branch 1: this node is not part of the selection.
        findCombine(below,tItemSupportSet);

        // Branch 2: this node is part of the selection.
        tItemSupportSet.push_back(ItemSupport(tRoot->m_ITItemName,tRoot->m_nSupportCount));
        findCombine(below,tItemSupportSet);
        tItemSupportSet.pop_back();
        return ;
    }
    printOneFreSet(tItemSupportSet);
}
/**
 * Prints one item set as "{ item ... count }", items in reverse order
 * of tItemSupportSet, where count is the smallest support count in the
 * set. Nothing is printed when that count is below the tree's
 * m_nMinSupport.
 *
 * Sets with fewer than two items are skipped. The empty set used to
 * fall through and print a line whose count was the internal start
 * value (m_nSize*10) of the minimum search; the minimum is now taken
 * over the actual entries only.
 *
 * @param tItemSupportSet item set to print
 */
void CFP_Growth::printOneFreSet(vector<ItemSupport> &tItemSupportSet)
{
    if(tItemSupportSet.size()<=1)
        return ;

    int count=tItemSupportSet.front().m_nSupportCount;
    for(vector<ItemSupport>::iterator iter=tItemSupportSet.begin();iter!=tItemSupportSet.end();++iter)
    {
        if(iter->m_nSupportCount<count)
            count=iter->m_nSupportCount;
    }
    if(count<m_pCFPTConditionTree->m_nMinSupport)
        return ;

    cout<<"{ ";
    for(vector<ItemSupport>::reverse_iterator iter=tItemSupportSet.rbegin();iter!=tItemSupportSet.rend();++iter)
    {
        cout<<iter->m_ITItemName<<" ";
    }
    cout<<count<<" }";
    cout<<endl;
}
void CFP_Growth::printPath()
{
vector<ItemSupport>tItemSupportSet;
cout<<"打印FP_Tree树:"<<endl;
this->m_pCFPTConditionTree->DFSPrintPath(this->m_pCFPTConditionTree->m_pCFP_TreeRoot,tItemSupportSet);
}
void CFP_Growth::printItemHeaderList()
{
cout<<"打印顶点表中每个单链表:"<<endl;
for(map<ItemType,CItemHeaderNode>::iterator iter=this->m_pCFPTConditionTree->m_mapItemHeaderList.begin();iter!=this->m_pCFPTConditionTree->m_mapItemHeaderList.end();++iter)
{
if(iter->second.m_pFPFirst!=NULL)
{
cout<<"("<<iter->first<<","<<iter->second.m_nSupportCount<<") :";
this->m_pCFPTConditionTree->printLinkList(iter->second.m_pFPFirst);
cout<<endl;
}
}
}
FP-growth算法比Apriori算法快一个数量级,在空间复杂度方面比Apriori也有数量级级别的优化。但是对于海量数据,FP-growth的时空复杂度仍然很高,可以采用的改进方法包括数据库划分,数据采样等等。
由于时间有限,在写博文的过程中参考过一些文献,在此表示感谢;同时鉴于水平原因,难免有不足之处,欢迎斧正!
标签:des style blog http io ar color 使用 sp
原文地址:http://blog.csdn.net/xiaofengcanyuexj/article/details/41656335